From 254e42abaa26458f085b79bbcd8fff60633f777a Mon Sep 17 00:00:00 2001
From: Marc Illa
Date: Wed, 1 Aug 2018 09:22:36 +0300
Subject: [PATCH 01/31] Merge branches 'master' and 'master' of
 git://github.com/millasub/DDalphaAMG into TM2p1p1

From d666fd880271e6fa538b3f8e292673302e21c7df Mon Sep 17 00:00:00 2001
From: Marc Illa
Date: Wed, 1 Aug 2018 09:25:11 +0300
Subject: [PATCH 02/31] WIP: moving vector_PRECISION from array to structure

---
 build/.gitignore                  |   3 +
 src/DDalphaAMG_interface.c        | 133 ++---
 src/coarse_oddeven_generic.c      | 402 +++++++-------
 src/coarse_oddeven_generic.h      |  20 +-
 src/coarse_operator_generic.c     | 356 ++++++------
 src/coarse_operator_generic.h     | 408 +++++++-------
 src/coarsening_generic.c          |   2 +-
 src/data_generic.c                |   6 +-
 src/data_generic.h                |   4 +-
 src/dirac.c                       |  45 +-
 src/dirac_generic.c               | 479 ++++++++--------
 src/dirac_generic.h               | 152 ++---
 src/gathering_generic.c           | 102 ++--
 src/gathering_generic.h           |   4 +-
 src/ghost_generic.c               | 106 ++--
 src/ghost_generic.h               |  10 +-
 src/init.c                        |   4 +-
 src/init_generic.c                |  81 +--
 src/interpolation_generic.c       |  80 +--
 src/interpolation_generic.h       |   6 +-
 src/io.c                          |  27 +-
 src/linalg.c                      |   8 +-
 src/linalg.h                      |   8 +-
 src/linalg_generic.c              | 164 +++---
 src/linalg_generic.h              |  34 +-
 src/linsolve.c                    |  78 +--
 src/linsolve.h                    |   4 +-
 src/linsolve_generic.c            | 308 +++++------
 src/linsolve_generic.h            |   6 +-
 src/main.h                        |   2 +
 src/main_post_def_generic.h       |  14 +-
 src/main_pre_def_generic.h        |  11 +-
 src/oddeven_generic.c             | 893 +++++++++++++++---------
 src/oddeven_generic.h             |  28 +-
 src/operator_generic.c            |  46 +-
 src/preconditioner.c              |  22 +-
 src/preconditioner.h              |   2 +-
 src/schwarz_generic.c             | 404 +++++++-------
 src/schwarz_generic.h             |  20 +-
 src/setup_generic.c               | 115 ++--
 src/setup_generic.h               |   2 +-
 src/sse_coarse_operator_generic.c |   4 +-
 src/sse_coarse_operator_generic.h |   8 +-
 src/sse_interpolation_generic.c   |  82 +--
 src/sse_interpolation_generic.h   |   6 +-
 src/sse_linalg.c                  |  58 +-
 src/sse_linalg_generic.c          |  10 +-
 src/sse_linalg_generic.h          |   2 +-
 src/top_level.c                   |  43 +-
 src/top_level.h                   |   6 +-
 src/var_table.h                   |  28 +-
 src/vcycle_generic.c              |  38 +-
 src/vcycle_generic.h              |   4 +-
 src/vector_generic.c              |  89 +++
 src/vector_generic.h              |  37 ++
 55 files changed, 2616 insertions(+), 2398 deletions(-)
 create mode 100644 src/vector_generic.c
 create mode 100644 src/vector_generic.h

diff --git a/build/.gitignore b/build/.gitignore
index 5761abc..35d987b 100644
--- a/build/.gitignore
+++ b/build/.gitignore
@@ -1 +1,4 @@
+*
 *.o
+!gsrc
+!.gitignore
\ No newline at end of file
diff --git a/src/DDalphaAMG_interface.c b/src/DDalphaAMG_interface.c
index 2bc3f82..edc54e7 100644
--- a/src/DDalphaAMG_interface.c
+++ b/src/DDalphaAMG_interface.c
@@ -570,7 +570,7 @@ void DDalphaAMG_update_setup( int iterations, DDalphaAMG_status * mg_status ) {
   }
 }
 
-static inline void vector_copy( vector_double vector_out, vector_double vector_in )
+static inline void vector_copy( vector_double *vector_out, vector_double *vector_in )
 {
   THREADED(threading[0]->n_core) {
   int start = threading[omp_get_thread_num()]->start_index[0],
@@ -591,7 +591,7 @@ static inline void solver( )
   }
 }
 
-static inline void correct_guess( vector_double guess, vector_double solution, vector_double solution2,
+static inline void correct_guess( vector_double *guess, vector_double *solution, vector_double *solution2,
                                   double even_dshift, double odd_dshift )
 {
   // guess = D^{-1}*rhs - i*dshift*D^{-2}*rhs
@@ -666,8 +666,8 @@ static inline void DDalphaAMG_driver( double *vector1_out, double *vector1_in, d
   complex_double twisted_bc, tmp1, tmp2;
   double
phase[4] = {_COMPLEX_double_ZERO, _COMPLEX_double_ZERO, _COMPLEX_double_ZERO, _COMPLEX_double_ZERO}, vmin=1, vmax=EPS_float, vtmp, nrhs, nrhs2; gmres_double_struct *p = g.mixed_precision==2?&(g.p_MP.dp):&(g.p); - vector_double vb=p->b, rhs = p->b; - vector_double vx=p->x, sol = p->x; + buffer_double vb=p->b.vector_buffer, vx=p->x.vector_buffer; + vector_double *rhs = &(p->b), *sol = &(p->x); DDalphaAMG_status tmp_status; double t0, t1; @@ -717,33 +717,33 @@ static inline void DDalphaAMG_driver( double *vector1_out, double *vector1_in, d for ( mu=0; mu<4; mu++ ) { for ( k=0; k<3; k++, j++ ) { #ifndef BASIS4 - rhs[j] = ((complex_double)vector1_in[i+2*(k+3*mu)] + I*(complex_double)vector1_in[i+2*(k+3*mu)+1]) * twisted_bc; - rhs[j+6] = ((complex_double)vector2_in[i+2*(k+3*mu)] + I*(complex_double)vector2_in[i+2*(k+3*mu)+1]) * twisted_bc; + rhs->vector_buffer[j] = ((complex_double)vector1_in[i+2*(k+3*mu)] + I*(complex_double)vector1_in[i+2*(k+3*mu)+1]) * twisted_bc; + rhs->vector_buffer[j+6] = ((complex_double)vector2_in[i+2*(k+3*mu)] + I*(complex_double)vector2_in[i+2*(k+3*mu)+1]) * twisted_bc; #else - rhs[j] = ((complex_double)vector1_in[i+2*(k+3*(3-mu))] + I*(complex_double)vector1_in[i+2*(k+3*(3-mu))+1]) * twisted_bc; - rhs[j+6] = ((complex_double)vector2_in[i+2*(k+3*(3-mu))] + I*(complex_double)vector2_in[i+2*(k+3*(3-mu))+1]) * twisted_bc; + rhs->vector_buffer[j] = ((complex_double)vector1_in[i+2*(k+3*(3-mu))] + I*(complex_double)vector1_in[i+2*(k+3*(3-mu))+1]) * twisted_bc; + rhs->vector_buffer[j+6] = ((complex_double)vector2_in[i+2*(k+3*(3-mu))] + I*(complex_double)vector2_in[i+2*(k+3*(3-mu))+1]) * twisted_bc; #endif if(p->initial_guess_zero == 0) { #ifndef BASIS4 - sol[j] = ((complex_double)vector1_out[i+2*(k+3*mu)] + I*(complex_double)vector1_out[i+2*(k+3*mu)+1]) * twisted_bc; - sol[j+6] = ((complex_double)vector2_out[i+2*(k+3*mu)] + I*(complex_double)vector2_out[i+2*(k+3*mu)+1]) * twisted_bc; + sol->vector_buffer[j] = ((complex_double)vector1_out[i+2*(k+3*mu)] + I*(complex_double)vector1_out[i+2*(k+3*mu)+1]) * twisted_bc; + sol->vector_buffer[j+6] = ((complex_double)vector2_out[i+2*(k+3*mu)] + I*(complex_double)vector2_out[i+2*(k+3*mu)+1]) * twisted_bc; #else - sol[j] = ((complex_double)vector1_out[i+2*(k+3*(3-mu))] + I*(complex_double)vector1_out[i+2*(k+3*(3-mu))+1]) * twisted_bc; - sol[j+6] = ((complex_double)vector2_out[i+2*(k+3*(3-mu))] + I*(complex_double)vector2_out[i+2*(k+3*(3-mu))+1]) * twisted_bc; + sol->vector_buffer[j] = ((complex_double)vector1_out[i+2*(k+3*(3-mu))] + I*(complex_double)vector1_out[i+2*(k+3*(3-mu))+1]) * twisted_bc; + sol->vector_buffer[j+6] = ((complex_double)vector2_out[i+2*(k+3*(3-mu))] + I*(complex_double)vector2_out[i+2*(k+3*(3-mu))+1]) * twisted_bc; #endif } #ifndef INIT_ONE_PREC if(g.mixed_precision==2) { - vtmp=cabs(rhs[j]); + vtmp=cabs(rhs->vector_buffer[j]); if(vtmp > vmax) vmax=vtmp; if( vtmp > EPS_double && vtmp < vmin ) vmin=vtmp; - vtmp=cabs(rhs[j+6]); + vtmp=cabs(rhs->vector_buffer[j+6]); if(vtmp > vmax) vmax=vtmp; if( vtmp > EPS_double && vtmp < vmin ) @@ -759,23 +759,23 @@ static inline void DDalphaAMG_driver( double *vector1_out, double *vector1_in, d for ( mu=0; mu<4; mu++ ) for ( k=0; k<3; k++, j++ ) { #ifndef BASIS4 - rhs[j] = ((complex_double)vector1_in[i+2*(k+3*mu)] + I*(complex_double)vector1_in[i+2*(k+3*mu)+1]) * twisted_bc; + rhs->vector_buffer[j] = ((complex_double)vector1_in[i+2*(k+3*mu)] + I*(complex_double)vector1_in[i+2*(k+3*mu)+1]) * twisted_bc; #else - rhs[j] = ((complex_double)vector1_in[i+2*(k+3*(3-mu))] + 
I*(complex_double)vector1_in[i+2*(k+3*(3-mu))+1]) * twisted_bc; + rhs->vector_buffer[j] = ((complex_double)vector1_in[i+2*(k+3*(3-mu))] + I*(complex_double)vector1_in[i+2*(k+3*(3-mu))+1]) * twisted_bc; #endif if(p->initial_guess_zero == 0) { #ifndef BASIS4 - sol[j] = ((complex_double)vector1_out[i+2*(k+3*mu)] + I*(complex_double)vector1_out[i+2*(k+3*mu)+1]) * twisted_bc; + sol->vector_buffer[j] = ((complex_double)vector1_out[i+2*(k+3*mu)] + I*(complex_double)vector1_out[i+2*(k+3*mu)+1]) * twisted_bc; #else - sol[j] = ((complex_double)vector1_out[i+2*(k+3*(3-mu))] + I*(complex_double)vector1_out[i+2*(k+3*(3-mu))+1]) * twisted_bc; + sol->vector_buffer[j] = ((complex_double)vector1_out[i+2*(k+3*(3-mu))] + I*(complex_double)vector1_out[i+2*(k+3*(3-mu))+1]) * twisted_bc; #endif } #ifndef INIT_ONE_PREC if(g.mixed_precision==2) { - vtmp=cabs(rhs[j]); + vtmp=cabs(rhs->vector_buffer[j]); if(vtmp > vmax) vmax=vtmp; if( vtmp > EPS_double && vtmp < vmin ) @@ -803,10 +803,10 @@ static inline void DDalphaAMG_driver( double *vector1_out, double *vector1_in, d g.mixed_precision=1; p = &(g.p); // storing pointer in x and b - vb = p->b; - vx = p->x; - p->b = g.p_MP.dp.b; - p->x = g.p_MP.dp.x; + vb = p->b.vector_buffer; + vx = p->x.vector_buffer; + p->b.vector_buffer = g.p_MP.dp.b.vector_buffer; + p->x.vector_buffer = g.p_MP.dp.x.vector_buffer; p->tol = g.p_MP.dp.tol; } else precision_changed = 0; #endif @@ -984,8 +984,8 @@ static inline void DDalphaAMG_driver( double *vector1_out, double *vector1_in, d if(g.n_flavours==2) { for ( mu=0; mu<4; mu++ ) { for ( k=0; k<3; k++, j++ ) { - tmp1 = sol[j] * twisted_bc; - tmp2 = sol[j+6] * twisted_bc; + tmp1 = sol->vector_buffer[j] * twisted_bc; + tmp2 = sol->vector_buffer[j+6] * twisted_bc; #ifndef BASIS4 vector1_out[i+2*(k+3*mu)] = creal(tmp1); vector1_out[i+2*(k+3*mu)+1] = cimag(tmp1); @@ -1005,7 +1005,7 @@ static inline void DDalphaAMG_driver( double *vector1_out, double *vector1_in, d #endif for ( mu=0; mu<4; mu++ ) for ( k=0; k<3; k++, j++ ) { - tmp1 = sol[j] * twisted_bc; + tmp1 = sol->vector_buffer[j] * twisted_bc; #ifndef BASIS4 vector1_out[i+2*(k+3*mu)] = creal(tmp1); vector1_out[i+2*(k+3*mu)+1] = cimag(tmp1); @@ -1023,8 +1023,8 @@ static inline void DDalphaAMG_driver( double *vector1_out, double *vector1_in, d if (precision_changed) { g.mixed_precision=2; // recovering pointer from x and b - p->b = vb; - p->x = vx; + p->b.vector_buffer = vb; + p->x.vector_buffer = vx; } #endif @@ -1049,9 +1049,10 @@ static inline void DDalphaAMG_ms_driver( double **vector1_out, double *vector1_i double phase[4] = {_COMPLEX_double_ZERO, _COMPLEX_double_ZERO, _COMPLEX_double_ZERO, _COMPLEX_double_ZERO}, vmin=1, vmax=EPS_float, vtmp, nrhs, nrhs2; gmres_double_struct *p = g.mixed_precision==2?&(g.p_MP.dp):&(g.p); - vector_double vb, rhs = p->b; - vector_double vx, sol = p->x; - vector_double source = NULL, solution = NULL, solution2 = NULL; + buffer_double vb, vx; + vector_double *rhs =&(p->b), *sol = &(p->x); + vector_double *source=NULL, *solution=NULL, *solution2=NULL; + DDalphaAMG_status tmp_status; double t0, t1; @@ -1102,22 +1103,22 @@ static inline void DDalphaAMG_ms_driver( double **vector1_out, double *vector1_i for ( mu=0; mu<4; mu++ ) { for ( k=0; k<3; k++, j++ ) { #ifndef BASIS4 - rhs[j] = ((complex_double)vector1_in[i+2*(k+3*mu)] + I*(complex_double)vector1_in[i+2*(k+3*mu)+1]) * twisted_bc; - rhs[j+6] = ((complex_double)vector2_in[i+2*(k+3*mu)] + I*(complex_double)vector2_in[i+2*(k+3*mu)+1]) * twisted_bc; + rhs->vector_buffer[j] = 
((complex_double)vector1_in[i+2*(k+3*mu)] + I*(complex_double)vector1_in[i+2*(k+3*mu)+1]) * twisted_bc; + rhs->vector_buffer[j+6] = ((complex_double)vector2_in[i+2*(k+3*mu)] + I*(complex_double)vector2_in[i+2*(k+3*mu)+1]) * twisted_bc; #else - rhs[j] = ((complex_double)vector1_in[i+2*(k+3*(3-mu))] + I*(complex_double)vector1_in[i+2*(k+3*(3-mu))+1]) * twisted_bc; - rhs[j+6] = ((complex_double)vector2_in[i+2*(k+3*(3-mu))] + I*(complex_double)vector2_in[i+2*(k+3*(3-mu))+1]) * twisted_bc; + rhs->vector_buffer[j] = ((complex_double)vector1_in[i+2*(k+3*(3-mu))] + I*(complex_double)vector1_in[i+2*(k+3*(3-mu))+1]) * twisted_bc; + rhs->vector_buffer[j+6] = ((complex_double)vector2_in[i+2*(k+3*(3-mu))] + I*(complex_double)vector2_in[i+2*(k+3*(3-mu))+1]) * twisted_bc; #endif #ifndef INIT_ONE_PREC if(g.mixed_precision==2) { - vtmp=cabs(rhs[j]); + vtmp=cabs(rhs->vector_buffer[j]); if(vtmp > vmax) vmax=vtmp; if( vtmp > EPS_double && vtmp < vmin ) vmin=vtmp; - vtmp=cabs(rhs[j+6]); + vtmp=cabs(rhs->vector_buffer[j+6]); if(vtmp > vmax) vmax=vtmp; if( vtmp > EPS_double && vtmp < vmin ) @@ -1133,14 +1134,14 @@ static inline void DDalphaAMG_ms_driver( double **vector1_out, double *vector1_i for ( mu=0; mu<4; mu++ ) for ( k=0; k<3; k++, j++ ) { #ifndef BASIS4 - rhs[j] = ((complex_double)vector1_in[i+2*(k+3*mu)] + I*(complex_double)vector1_in[i+2*(k+3*mu)+1]) * twisted_bc; + rhs->vector_buffer[j] = ((complex_double)vector1_in[i+2*(k+3*mu)] + I*(complex_double)vector1_in[i+2*(k+3*mu)+1]) * twisted_bc; #else - rhs[j] = ((complex_double)vector1_in[i+2*(k+3*(3-mu))] + I*(complex_double)vector1_in[i+2*(k+3*(3-mu))+1]) * twisted_bc; + rhs->vector_buffer[j] = ((complex_double)vector1_in[i+2*(k+3*(3-mu))] + I*(complex_double)vector1_in[i+2*(k+3*(3-mu))+1]) * twisted_bc; #endif #ifndef INIT_ONE_PREC if( g.mixed_precision == 2 ) { - vtmp = cabs(rhs[j]); + vtmp = cabs(rhs->vector_buffer[j]); if(vtmp > vmax) vmax = vtmp; if( vtmp > EPS_double && vtmp < vmin ) @@ -1168,8 +1169,8 @@ static inline void DDalphaAMG_ms_driver( double **vector1_out, double *vector1_i g.mixed_precision=1; p = &(g.p); // storing pointer in x and b - vb = p->b; - vx = p->x; + vb = p->b.vector_buffer; + vx = p->x.vector_buffer; p->b = g.p_MP.dp.b; p->x = g.p_MP.dp.x; p->tol = g.p_MP.dp.tol; @@ -1181,10 +1182,10 @@ static inline void DDalphaAMG_ms_driver( double **vector1_out, double *vector1_i ASSERT( odd_shifts != NULL ); } if ( n_shifts > 1 ) { - MALLOC( source, complex_double, l.inner_vector_size ); - MALLOC( solution, complex_double, l.inner_vector_size ); + MALLOC( source->vector_buffer, complex_double, l.inner_vector_size ); + MALLOC( solution->vector_buffer, complex_double, l.inner_vector_size ); if( _TYPE == _SOLVE_SQ || _TYPE == _SOLVE_SQ_ODD || _TYPE == _SOLVE_SQ_EVEN ) - MALLOC( solution2, complex_double, l.inner_vector_size ); + MALLOC( solution2->vector_buffer, complex_double, l.inner_vector_size ); } for ( n = 0; n < n_shifts; n++ ) { @@ -1449,8 +1450,8 @@ static inline void DDalphaAMG_ms_driver( double **vector1_out, double *vector1_i if(g.n_flavours==2) { for ( mu=0; mu<4; mu++ ) { for ( k=0; k<3; k++, j++ ) { - tmp1 = sol[j] * twisted_bc; - tmp2 = sol[j+6] * twisted_bc; + tmp1 = sol->vector_buffer[j] * twisted_bc; + tmp2 = sol->vector_buffer[j+6] * twisted_bc; #ifndef BASIS4 vector1_out[n][i+2*(k+3*mu)] = creal(tmp1); vector1_out[n][i+2*(k+3*mu)+1] = cimag(tmp1); @@ -1470,7 +1471,7 @@ static inline void DDalphaAMG_ms_driver( double **vector1_out, double *vector1_i #endif for ( mu=0; mu<4; mu++ ) for ( k=0; k<3; k++, j++ ) { - tmp1 = 
sol[j] * twisted_bc; + tmp1 = sol->vector_buffer[j] * twisted_bc; #ifndef BASIS4 vector1_out[n][i+2*(k+3*mu)] = creal(tmp1); vector1_out[n][i+2*(k+3*mu)+1] = cimag(tmp1); @@ -1488,10 +1489,10 @@ static inline void DDalphaAMG_ms_driver( double **vector1_out, double *vector1_i p->initial_guess_zero = 1; if ( n_shifts > 0 ) { - FREE( source, complex_double, l.inner_vector_size ); - FREE( solution, complex_double, l.inner_vector_size ); + FREE( source->vector_buffer, complex_double, l.inner_vector_size ); + FREE( solution->vector_buffer, complex_double, l.inner_vector_size ); if( _TYPE == _SOLVE_SQ || _TYPE == _SOLVE_SQ_ODD || _TYPE == _SOLVE_SQ_EVEN ) - FREE( solution2, complex_double, l.inner_vector_size ); + FREE( solution2->vector_buffer, complex_double, l.inner_vector_size ); } @@ -1499,8 +1500,8 @@ static inline void DDalphaAMG_ms_driver( double **vector1_out, double *vector1_i if (precision_changed) { g.mixed_precision=2; // recovering pointer from x and b - p->b = vb; - p->x = vx; + p->b.vector_buffer = vb; + p->x.vector_buffer = vx; } #endif @@ -1533,8 +1534,8 @@ static inline void DDalphaAMG_proj_driver( double *vector_out, double *vector_in from=ltmp->next_level; to=ltmp; } - vector_float rhs = from->p_float.b; - vector_float sol = to->p_float.x; + vector_float *rhs = &(from->p_float.b); + vector_float *sol = &(to->p_float.x); double t0, t1; t0 = MPI_Wtime(); @@ -1559,7 +1560,7 @@ static inline void DDalphaAMG_proj_driver( double *vector_out, double *vector_in i = 2*j; for ( mu=0; munum_lattice_site_var; mu++, j++ ) - rhs[j] = ((complex_float)vector_in[i+2*mu] + I*(complex_float)vector_in[i+2*mu+1]); + rhs->vector_buffer[j] = ((complex_float)vector_in[i+2*mu] + I*(complex_float)vector_in[i+2*mu+1]); } switch(_TYPE) { @@ -1596,8 +1597,8 @@ static inline void DDalphaAMG_proj_driver( double *vector_out, double *vector_in i = 2*j; for ( mu=0; munum_lattice_site_var; mu++, j++ ) { - vector_out[i+2*mu] = (double) creal(sol[j]); - vector_out[i+2*mu+1] = (double) cimag(sol[j]); + vector_out[i+2*mu] = (double) creal(sol->vector_buffer[j]); + vector_out[i+2*mu+1] = (double) cimag(sol->vector_buffer[j]); } } @@ -1839,7 +1840,9 @@ void DDalphaAMG_define_vector_const( double *vector, double re, double im ) { if(vector!=NULL){ int start, end; compute_core_start_end( 0, l.inner_vector_size, &start, &end, &l, threading[omp_get_thread_num()]); - vector_double_define( (vector_double) vector, re+I*im, start, end, &l ); + vector_double vec; + vec.vector_buffer= (buffer_double) vector; + vector_double_define( &vec, re+I*im, start, end, &l ); } else { warning0("Vector NULL when calling DDalphaAMG_define_vector_const!"); @@ -1852,7 +1855,9 @@ void DDalphaAMG_define_vector_rand( double *vector ) { if(vector!=NULL){ int start, end; compute_core_start_end( 0, l.inner_vector_size, &start, &end, &l, threading[omp_get_thread_num()]); - vector_double_define_random( (vector_double) vector, start, end, &l ); + vector_double vec; + vec.vector_buffer= (buffer_double) vector; + vector_double_define_random( &vec, start, end, &l ); } else { warning0("Vector NULL when calling DDalphaAMG_define_vector_const!"); @@ -1865,7 +1870,9 @@ double DDalphaAMG_vector_norm( double *vector ) { double norm = 0; THREADED(threading[0]->n_core) if(vector!=NULL){ - norm = global_norm_double( (vector_double) vector, 0, l.inner_vector_size, &l, threading[omp_get_thread_num()] ); + vector_double vec; + vec.vector_buffer = (buffer_double) vector; + norm = global_norm_double( &vec, 0, l.inner_vector_size, &l, threading[omp_get_thread_num()] 
); } else { warning0("Vector NULL when calling DDalphaAMG_define_vector_const!"); @@ -1880,7 +1887,9 @@ void DDalphaAMG_vector_saxpy( double *vector_out, double a, double *x, double *y if(vector_out!=NULL && x!=NULL && y!=NULL){ int start, end; compute_core_start_end( 0, l.inner_vector_size, &start, &end, &l, threading[omp_get_thread_num()]); - vector_double_saxpy( (vector_double) vector_out, (vector_double) x, (vector_double) y, a, start, end, &l ); + vector_double vec_out, xx, yy; + vec_out.vector_buffer= (buffer_double) vector_out; xx.vector_buffer= (buffer_double) x; yy.vector_buffer= (buffer_double) y; + vector_double_saxpy( &vec_out, &xx, &yy, a, start, end, &l ); } else { warning0("Vector NULL when calling DDalphaAMG_define_vector_const!"); diff --git a/src/coarse_oddeven_generic.c b/src/coarse_oddeven_generic.c index b0baa6a..615ba9d 100644 --- a/src/coarse_oddeven_generic.c +++ b/src/coarse_oddeven_generic.c @@ -254,47 +254,47 @@ void coarse_selfcoupling_LU_doublet_decomposition_PRECISION( config_PRECISION ou #endif -void coarse_perform_fwd_bwd_subs_PRECISION( vector_PRECISION x, vector_PRECISION b, config_PRECISION A, level_struct *l ) { +void coarse_perform_fwd_bwd_subs_PRECISION( vector_PRECISION *x, vector_PRECISION *b, config_PRECISION A, level_struct *l ) { register int i, j, n2 = l->num_lattice_site_var; // solve x = U^(-1) L^(-1) b // forward substitution with L for ( i=0; ivector_buffer[i] = b->vector_buffer[i]; for ( j=0; jvector_buffer[i] = x->vector_buffer[i] - A[i*n2+j]*x->vector_buffer[j]; } } // backward substitution with U for ( i=n2-1; i>=0; i-- ) { for ( j=i+1; jvector_buffer[i] = x->vector_buffer[i] - A[i*n2+j]*x->vector_buffer[j]; } - x[i] = x[i]/A[i*(n2+1)]; + x->vector_buffer[i] = x->vector_buffer[i]/A[i*(n2+1)]; } } -void coarse_LU_multiply_PRECISION( vector_PRECISION y, vector_PRECISION x, config_PRECISION A, level_struct *l ) { +void coarse_LU_multiply_PRECISION( vector_PRECISION *y, vector_PRECISION *x, config_PRECISION A, level_struct *l ) { register int i, j, n2 = l->num_lattice_site_var; // y = Ax // multiplication with U for ( i=0; ivector_buffer[i] = A[i*(n2+1)]*x->vector_buffer[i]; for ( j=i+1; jvector_buffer[i] += A[i*n2+j]*x->vector_buffer[j]; } // multiplication with L for ( i=n2-1; i>0; i-- ) for ( j=0; jvector_buffer[i] += A[i*n2+j]*y->vector_buffer[j]; } -void coarse_diag_ee_PRECISION( vector_PRECISION y, vector_PRECISION x, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { +void coarse_diag_ee_PRECISION( vector_PRECISION *y, vector_PRECISION *x, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { int start, end; compute_core_start_end_custom( 0, op->num_even_sites, &start, &end, l, threading, 1 ); @@ -306,7 +306,7 @@ void coarse_diag_ee_PRECISION( vector_PRECISION y, vector_PRECISION x, operator_ #endif } -void coarse_diag_oo_PRECISION( vector_PRECISION y, vector_PRECISION x, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { +void coarse_diag_oo_PRECISION( vector_PRECISION *y, vector_PRECISION *x, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { int start, end; #ifndef OPTIMIZED_COARSE_SELF_COUPLING_PRECISION @@ -320,14 +320,14 @@ void coarse_diag_oo_PRECISION( vector_PRECISION y, vector_PRECISION x, operator_ compute_core_start_end_custom( 0, op->num_odd_sites, &start, &end, l, threading, 1 ); - x += num_site_var*(op->num_even_sites+start); - y += num_site_var*(op->num_even_sites+start); + x->vector_buffer += 
num_site_var*(op->num_even_sites+start); + y->vector_buffer += num_site_var*(op->num_even_sites+start); sc += oo_inv_size*start; for ( int i=start; ivector_buffer += num_site_var; + y->vector_buffer += num_site_var; sc += oo_inv_size; } @@ -337,13 +337,13 @@ void coarse_diag_oo_PRECISION( vector_PRECISION y, vector_PRECISION x, operator_ #endif } -void coarse_diag_PRECISION( vector_PRECISION y, vector_PRECISION x, operator_PRECISION_struct *op, level_struct *l ) { +void coarse_diag_PRECISION( vector_PRECISION *y, vector_PRECISION *x, operator_PRECISION_struct *op, level_struct *l ) { coarse_diag_ee_PRECISION( y, x, op, l, no_threading ); coarse_diag_oo_PRECISION( y, x, op, l, no_threading ); } -void coarse_diag_oo_inv_PRECISION( vector_PRECISION y, vector_PRECISION x, operator_PRECISION_struct *op, +void coarse_diag_oo_inv_PRECISION( vector_PRECISION *y, vector_PRECISION *x, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { int start, end; @@ -369,8 +369,8 @@ void coarse_diag_oo_inv_PRECISION( vector_PRECISION y, vector_PRECISION x, opera #endif #endif - x += num_site_var*(op->num_even_sites+start); - y += num_site_var*(op->num_even_sites+start); + x->vector_buffer += num_site_var*(op->num_even_sites+start); + y->vector_buffer += num_site_var*(op->num_even_sites+start); sc += oo_inv_size*start; for ( int i=start; ivector_buffer[j] = _COMPLEX_PRECISION_ZERO; cgemv( num_site_var, sc, lda, (float *)x, (float *)y); #endif - x += num_site_var; - y += num_site_var; + x->vector_buffer += num_site_var; + y->vector_buffer += num_site_var; sc += oo_inv_size; } } @@ -444,14 +444,14 @@ void coarse_oddeven_alloc_PRECISION( level_struct *l ) { operator_PRECISION_alloc( op, _ODDEVEN, l ); // buffers - MALLOC( op->buffer, complex_PRECISION*, 2 ); - op->buffer[0] = NULL; + MALLOC( op->buffer, vector_PRECISION, 2 ); + vector_PRECISION_init(&(op->buffer[0])); #ifdef HAVE_TM1p1 - MALLOC( op->buffer[0], complex_PRECISION, 4*l->vector_size ); - op->buffer[1] = op->buffer[0] + 2*l->vector_size; + MALLOC( op->buffer[0].vector_buffer, complex_PRECISION, 4*l->vector_size ); + op->buffer[1].vector_buffer = op->buffer[0].vector_buffer + 2*l->vector_size; #else - MALLOC( op->buffer[0], complex_PRECISION, 2*l->vector_size ); - op->buffer[1] = op->buffer[0] + l->vector_size; + MALLOC( op->buffer[0].vector_buffer, complex_PRECISION, 2*l->vector_size ); + op->buffer[1].vector_buffer = op->buffer[0].vector_buffer + l->vector_size; #endif for ( mu=0; mu<4; mu++ ) { @@ -625,15 +625,15 @@ void coarse_oddeven_free_PRECISION( level_struct *l ) { #endif #ifdef HAVE_TM1p1 - FREE( op->buffer[0], complex_PRECISION, 4*vs ); + FREE( op->buffer[0].vector_buffer, complex_PRECISION, 4*vs ); #else - FREE( op->buffer[0], complex_PRECISION, 2*vs ); + FREE( op->buffer[0].vector_buffer, complex_PRECISION, 2*vs ); #endif - FREE( op->buffer, complex_PRECISION*, 2 ); + FREE( op->buffer, vector_PRECISION, 2 ); } -void coarse_hopping_term_PRECISION( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, +void coarse_hopping_term_PRECISION( vector_PRECISION *out, vector_PRECISION *in, operator_PRECISION_struct *op, const int amount, level_struct *l, struct Thread *threading ) { START_NO_HYPERTHREADS(threading) @@ -646,6 +646,9 @@ void coarse_hopping_term_PRECISION( vector_PRECISION out, vector_PRECISION in, o vector_PRECISION in_pt, out_pt; config_PRECISION D_pt; + in_pt = *in; + out_pt = *out; + int core_start; int core_end; @@ -665,7 +668,7 @@ void coarse_hopping_term_PRECISION( vector_PRECISION out, 
vector_PRECISION in, o if ( op->c.comm ) { for ( mu=0; mu<4; mu++ ) { // communicate in -mu direction - ghost_sendrecv_PRECISION( in, mu, -1, &(op->c), minus_dir_param, l ); + ghost_sendrecv_PRECISION( in->vector_buffer, mu, -1, &(op->c), minus_dir_param, l ); } } END_MASTER(threading) @@ -681,49 +684,49 @@ void coarse_hopping_term_PRECISION( vector_PRECISION out, vector_PRECISION in, o // compute U_mu^dagger coupling for ( i=core_start; ineighbor_table[index]; + in_pt.vector_buffer = in->vector_buffer + num_site_var*op->neighbor_table[index]; D_pt = op->D + num_4link_var*op->neighbor_table[index] + 0*num_link_var; index++; - out_pt = out + num_site_var*op->neighbor_table[index+T]; - coarse_daggered_hopp_PRECISION( out_pt, in_pt, D_pt, l ); + out_pt.vector_buffer = out->vector_buffer + num_site_var*op->neighbor_table[index+T]; + coarse_daggered_hopp_PRECISION( &out_pt, &in_pt, D_pt, l ); } SYNC_CORES(threading) for ( i=core_start; ineighbor_table[index]; + in_pt.vector_buffer = in->vector_buffer + num_site_var*op->neighbor_table[index]; D_pt = op->D + num_4link_var*op->neighbor_table[index] + 1*num_link_var; index++; - out_pt = out + num_site_var*op->neighbor_table[index+Z]; - coarse_daggered_hopp_PRECISION( out_pt, in_pt, D_pt, l ); + out_pt.vector_buffer = out->vector_buffer + num_site_var*op->neighbor_table[index+Z]; + coarse_daggered_hopp_PRECISION( &out_pt, &in_pt, D_pt, l ); } SYNC_CORES(threading) for ( i=core_start; ineighbor_table[index]; + in_pt.vector_buffer = in->vector_buffer + num_site_var*op->neighbor_table[index]; D_pt = op->D + num_4link_var*op->neighbor_table[index] + 2*num_link_var; index++; - out_pt = out + num_site_var*op->neighbor_table[index+Y]; - coarse_daggered_hopp_PRECISION( out_pt, in_pt, D_pt, l ); + out_pt.vector_buffer = out->vector_buffer + num_site_var*op->neighbor_table[index+Y]; + coarse_daggered_hopp_PRECISION( &out_pt, &in_pt, D_pt, l ); } SYNC_CORES(threading) for ( i=core_start; ineighbor_table[index]; + in_pt.vector_buffer = in->vector_buffer + num_site_var*op->neighbor_table[index]; D_pt = op->D + num_4link_var*op->neighbor_table[index] + 3*num_link_var; index++; - out_pt = out + num_site_var*op->neighbor_table[index+X]; - coarse_daggered_hopp_PRECISION( out_pt, in_pt, D_pt, l ); + out_pt.vector_buffer = out->vector_buffer + num_site_var*op->neighbor_table[index+X]; + coarse_daggered_hopp_PRECISION( &out_pt, &in_pt, D_pt, l ); } START_LOCKED_MASTER(threading) if ( op->c.comm ) { for ( mu=0; mu<4; mu++ ) { // communicate in +mu direction - ghost_sendrecv_PRECISION( out, mu, +1, &(op->c), plus_dir_param, l ); + ghost_sendrecv_PRECISION( out->vector_buffer, mu, +1, &(op->c), plus_dir_param, l ); } for ( mu=0; mu<4; mu++ ) { // wait for -mu direction - ghost_wait_PRECISION( in, mu, -1, &(op->c), minus_dir_param, l ); + ghost_wait_PRECISION( in->vector_buffer, mu, -1, &(op->c), minus_dir_param, l ); } } END_LOCKED_MASTER(threading) @@ -738,30 +741,30 @@ void coarse_hopping_term_PRECISION( vector_PRECISION out, vector_PRECISION in, o // compute U_mu couplings for ( i=core_start; ineighbor_table[index]; + out_pt.vector_buffer = out->vector_buffer + num_site_var*op->neighbor_table[index]; D_pt = op->D + num_4link_var*op->neighbor_table[index]; index++; - in_pt = in + num_site_var*op->neighbor_table[index+T]; - coarse_hopp_PRECISION( out_pt, in_pt, D_pt, l ); + in_pt.vector_buffer = in->vector_buffer + num_site_var*op->neighbor_table[index+T]; + coarse_hopp_PRECISION( &out_pt, &in_pt, D_pt, l ); D_pt += num_link_var; - in_pt = in + 
num_site_var*op->neighbor_table[index+Z]; - coarse_hopp_PRECISION( out_pt, in_pt, D_pt, l ); + in_pt.vector_buffer = in->vector_buffer + num_site_var*op->neighbor_table[index+Z]; + coarse_hopp_PRECISION( &out_pt, &in_pt, D_pt, l ); D_pt += num_link_var; - in_pt = in + num_site_var*op->neighbor_table[index+Y]; - coarse_hopp_PRECISION( out_pt, in_pt, D_pt, l ); + in_pt.vector_buffer = in->vector_buffer + num_site_var*op->neighbor_table[index+Y]; + coarse_hopp_PRECISION( &out_pt, &in_pt, D_pt, l ); D_pt += num_link_var; - in_pt = in + num_site_var*op->neighbor_table[index+X]; - coarse_hopp_PRECISION( out_pt, in_pt, D_pt, l ); + in_pt.vector_buffer = in->vector_buffer + num_site_var*op->neighbor_table[index+X]; + coarse_hopp_PRECISION( &out_pt, &in_pt, D_pt, l ); } START_LOCKED_MASTER(threading) if ( op->c.comm ) { for ( mu=0; mu<4; mu++ ) { // wait for +mu direction - ghost_wait_PRECISION( out, mu, +1, &(op->c), plus_dir_param, l ); + ghost_wait_PRECISION( out->vector_buffer, mu, +1, &(op->c), plus_dir_param, l ); } } END_LOCKED_MASTER(threading) @@ -770,7 +773,7 @@ void coarse_hopping_term_PRECISION( vector_PRECISION out, vector_PRECISION in, o } -void coarse_n_hopping_term_PRECISION( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, +void coarse_n_hopping_term_PRECISION( vector_PRECISION *out, vector_PRECISION *in, operator_PRECISION_struct *op, const int amount, level_struct *l, struct Thread *threading ) { #ifdef OPTIMIZED_COARSE_NEIGHBOR_COUPLING_PRECISION @@ -791,6 +794,8 @@ void coarse_n_hopping_term_PRECISION( vector_PRECISION out, vector_PRECISION in, plus_dir_param=_FULL_SYSTEM, minus_dir_param=_FULL_SYSTEM; vector_PRECISION in_pt, out_pt; config_PRECISION D_pt; + in_pt = *in; + out_pt = *out; int core_start; int core_end; @@ -811,7 +816,7 @@ void coarse_n_hopping_term_PRECISION( vector_PRECISION out, vector_PRECISION in, if ( op->c.comm ) { for ( mu=0; mu<4; mu++ ) { // communicate in -mu direction - ghost_sendrecv_PRECISION( in, mu, -1, &(op->c), minus_dir_param, l ); + ghost_sendrecv_PRECISION( in->vector_buffer, mu, -1, &(op->c), minus_dir_param, l ); } } END_MASTER(threading) @@ -827,49 +832,49 @@ void coarse_n_hopping_term_PRECISION( vector_PRECISION out, vector_PRECISION in, // compute U_mu^dagger coupling for ( i=core_start; ineighbor_table[index]; + in_pt.vector_buffer = in->vector_buffer + num_site_var*op->neighbor_table[index]; D_pt = op->D + num_4link_var*op->neighbor_table[index] + 0*num_link_var; index++; - out_pt = out + num_site_var*op->neighbor_table[index+T]; - coarse_n_daggered_hopp_PRECISION( out_pt, in_pt, D_pt, l ); + out_pt.vector_buffer = out->vector_buffer + num_site_var*op->neighbor_table[index+T]; + coarse_n_daggered_hopp_PRECISION( &out_pt, &in_pt, D_pt, l ); } SYNC_CORES(threading) for ( i=core_start; ineighbor_table[index]; + in_pt.vector_buffer = in->vector_buffer + num_site_var*op->neighbor_table[index]; D_pt = op->D + num_4link_var*op->neighbor_table[index] + 1*num_link_var; index++; - out_pt = out + num_site_var*op->neighbor_table[index+Z]; - coarse_n_daggered_hopp_PRECISION( out_pt, in_pt, D_pt, l ); + out_pt.vector_buffer = out->vector_buffer + num_site_var*op->neighbor_table[index+Z]; + coarse_n_daggered_hopp_PRECISION( &out_pt, &in_pt, D_pt, l ); } SYNC_CORES(threading) for ( i=core_start; ineighbor_table[index]; + in_pt.vector_buffer = in->vector_buffer + num_site_var*op->neighbor_table[index]; D_pt = op->D + num_4link_var*op->neighbor_table[index] + 2*num_link_var; index++; - out_pt = out + 
num_site_var*op->neighbor_table[index+Y]; - coarse_n_daggered_hopp_PRECISION( out_pt, in_pt, D_pt, l ); + out_pt.vector_buffer = out->vector_buffer + num_site_var*op->neighbor_table[index+Y]; + coarse_n_daggered_hopp_PRECISION( &out_pt, &in_pt, D_pt, l ); } SYNC_CORES(threading) for ( i=core_start; ineighbor_table[index]; + in_pt.vector_buffer = in->vector_buffer + num_site_var*op->neighbor_table[index]; D_pt = op->D + num_4link_var*op->neighbor_table[index] + 3*num_link_var; index++; - out_pt = out + num_site_var*op->neighbor_table[index+X]; - coarse_n_daggered_hopp_PRECISION( out_pt, in_pt, D_pt, l ); + out_pt.vector_buffer = out->vector_buffer + num_site_var*op->neighbor_table[index+X]; + coarse_n_daggered_hopp_PRECISION( &out_pt, &in_pt, D_pt, l ); } START_LOCKED_MASTER(threading) if ( op->c.comm ) { for ( mu=0; mu<4; mu++ ) { // communicate in +mu direction - ghost_sendrecv_PRECISION( out, mu, +1, &(op->c), plus_dir_param, l ); + ghost_sendrecv_PRECISION( out->vector_buffer, mu, +1, &(op->c), plus_dir_param, l ); } for ( mu=0; mu<4; mu++ ) { // wait for -mu direction - ghost_wait_PRECISION( in, mu, -1, &(op->c), minus_dir_param, l ); + ghost_wait_PRECISION( in->vector_buffer, mu, -1, &(op->c), minus_dir_param, l ); } } END_LOCKED_MASTER(threading) @@ -884,30 +889,30 @@ void coarse_n_hopping_term_PRECISION( vector_PRECISION out, vector_PRECISION in, // compute U_mu couplings for ( i=core_start; ineighbor_table[index]; + out_pt.vector_buffer = out->vector_buffer + num_site_var*op->neighbor_table[index]; D_pt = op->D + num_4link_var*op->neighbor_table[index]; index++; - in_pt = in + num_site_var*op->neighbor_table[index+T]; - coarse_n_hopp_PRECISION( out_pt, in_pt, D_pt, l ); + in_pt.vector_buffer = in->vector_buffer + num_site_var*op->neighbor_table[index+T]; + coarse_n_hopp_PRECISION( &out_pt, &in_pt, D_pt, l ); D_pt += num_link_var; - in_pt = in + num_site_var*op->neighbor_table[index+Z]; - coarse_n_hopp_PRECISION( out_pt, in_pt, D_pt, l ); + in_pt.vector_buffer = in->vector_buffer + num_site_var*op->neighbor_table[index+Z]; + coarse_n_hopp_PRECISION( &out_pt, &in_pt, D_pt, l ); D_pt += num_link_var; - in_pt = in + num_site_var*op->neighbor_table[index+Y]; - coarse_n_hopp_PRECISION( out_pt, in_pt, D_pt, l ); + in_pt.vector_buffer = in->vector_buffer + num_site_var*op->neighbor_table[index+Y]; + coarse_n_hopp_PRECISION( &out_pt, &in_pt, D_pt, l ); D_pt += num_link_var; - in_pt = in + num_site_var*op->neighbor_table[index+X]; - coarse_n_hopp_PRECISION( out_pt, in_pt, D_pt, l ); + in_pt.vector_buffer = in->vector_buffer + num_site_var*op->neighbor_table[index+X]; + coarse_n_hopp_PRECISION( &out_pt, &in_pt, D_pt, l ); } START_LOCKED_MASTER(threading) if ( op->c.comm ) { for ( mu=0; mu<4; mu++ ) { // wait for +mu direction - ghost_wait_PRECISION( out, mu, +1, &(op->c), plus_dir_param, l ); + ghost_wait_PRECISION( out->vector_buffer, mu, +1, &(op->c), plus_dir_param, l ); } } END_LOCKED_MASTER(threading) @@ -917,7 +922,7 @@ void coarse_n_hopping_term_PRECISION( vector_PRECISION out, vector_PRECISION in, } -void coarse_hopping_term_PRECISION_vectorized( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, +void coarse_hopping_term_PRECISION_vectorized( vector_PRECISION *out, vector_PRECISION *in, operator_PRECISION_struct *op, const int amount, level_struct *l, struct Thread *threading ) { #ifdef OPTIMIZED_COARSE_NEIGHBOR_COUPLING_PRECISION @@ -927,6 +932,8 @@ void coarse_hopping_term_PRECISION_vectorized( vector_PRECISION out, vector_PREC start=0, 
num_lattice_sites=l->num_inner_lattice_sites, plus_dir_param=_FULL_SYSTEM, minus_dir_param=_FULL_SYSTEM; vector_PRECISION in_pt, out_pt; + in_pt = *in; + out_pt = *out; OPERATOR_TYPE_PRECISION *D_vectorized; int column_offset = 2*SIMD_LENGTH_PRECISION*((l->num_parent_eig_vect+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); @@ -951,7 +958,7 @@ void coarse_hopping_term_PRECISION_vectorized( vector_PRECISION out, vector_PREC if ( op->c.comm ) { for ( mu=0; mu<4; mu++ ) { // communicate in -mu direction - ghost_sendrecv_PRECISION( in, mu, -1, &(op->c), minus_dir_param, l ); + ghost_sendrecv_PRECISION( in->vector_buffer, mu, -1, &(op->c), minus_dir_param, l ); } } END_MASTER(threading) @@ -967,49 +974,49 @@ void coarse_hopping_term_PRECISION_vectorized( vector_PRECISION out, vector_PREC // compute U_mu^dagger coupling for ( i=core_start; ineighbor_table[index]; + in_pt.vector_buffer = in->vector_buffer + num_site_var*op->neighbor_table[index]; D_vectorized = op->D_transformed_vectorized + 4*vectorized_link_offset*op->neighbor_table[index] + 0*vectorized_link_offset; index++; - out_pt = out + num_site_var*op->neighbor_table[index+T]; - coarse_hopp_PRECISION_vectorized( out_pt, in_pt, D_vectorized, l ); + out_pt.vector_buffer = out->vector_buffer + num_site_var*op->neighbor_table[index+T]; + coarse_hopp_PRECISION_vectorized( &out_pt, &in_pt, D_vectorized, l ); } SYNC_CORES(threading) for ( i=core_start; ineighbor_table[index]; + in_pt.vector_buffer = in->vector_buffer + num_site_var*op->neighbor_table[index]; D_vectorized = op->D_transformed_vectorized + 4*vectorized_link_offset*op->neighbor_table[index] + 1*vectorized_link_offset; index++; - out_pt = out + num_site_var*op->neighbor_table[index+Z]; - coarse_hopp_PRECISION_vectorized( out_pt, in_pt, D_vectorized, l ); + out_pt.vector_buffer = out->vector_buffer + num_site_var*op->neighbor_table[index+Z]; + coarse_hopp_PRECISION_vectorized( &out_pt, &in_pt, D_vectorized, l ); } SYNC_CORES(threading) for ( i=core_start; ineighbor_table[index]; + in_pt.vector_buffer = in->vector_buffer + num_site_var*op->neighbor_table[index]; D_vectorized = op->D_transformed_vectorized + 4*vectorized_link_offset*op->neighbor_table[index] + 2*vectorized_link_offset; index++; - out_pt = out + num_site_var*op->neighbor_table[index+Y]; - coarse_hopp_PRECISION_vectorized( out_pt, in_pt, D_vectorized, l ); + out_pt.vector_buffer = out->vector_buffer + num_site_var*op->neighbor_table[index+Y]; + coarse_hopp_PRECISION_vectorized( &out_pt, &in_pt, D_vectorized, l ); } SYNC_CORES(threading) for ( i=core_start; ineighbor_table[index]; + in_pt.vector_buffer = in->vector_buffer + num_site_var*op->neighbor_table[index]; D_vectorized = op->D_transformed_vectorized + 4*vectorized_link_offset*op->neighbor_table[index] + 3*vectorized_link_offset; index++; - out_pt = out + num_site_var*op->neighbor_table[index+X]; - coarse_hopp_PRECISION_vectorized( out_pt, in_pt, D_vectorized, l ); + out_pt.vector_buffer = out->vector_buffer + num_site_var*op->neighbor_table[index+X]; + coarse_hopp_PRECISION_vectorized( &out_pt, &in_pt, D_vectorized, l ); } START_LOCKED_MASTER(threading) if ( op->c.comm ) { for ( mu=0; mu<4; mu++ ) { // communicate in +mu direction - ghost_sendrecv_PRECISION( out, mu, +1, &(op->c), plus_dir_param, l ); + ghost_sendrecv_PRECISION( out->vector_buffer, mu, +1, &(op->c), plus_dir_param, l ); } for ( mu=0; mu<4; mu++ ) { // wait for -mu direction - ghost_wait_PRECISION( in, mu, -1, &(op->c), minus_dir_param, l ); + ghost_wait_PRECISION( in->vector_buffer, mu, -1, 
&(op->c), minus_dir_param, l ); } } END_LOCKED_MASTER(threading) @@ -1024,30 +1031,30 @@ void coarse_hopping_term_PRECISION_vectorized( vector_PRECISION out, vector_PREC // compute U_mu couplings for ( i=core_start; ineighbor_table[index]; + out_pt.vector_buffer = out->vector_buffer + num_site_var*op->neighbor_table[index]; D_vectorized = op->D_vectorized + 4*vectorized_link_offset*op->neighbor_table[index]; index++; - in_pt = in + num_site_var*op->neighbor_table[index+T]; - coarse_hopp_PRECISION_vectorized( out_pt, in_pt, D_vectorized, l ); + in_pt.vector_buffer = in->vector_buffer + num_site_var*op->neighbor_table[index+T]; + coarse_hopp_PRECISION_vectorized( &out_pt, &in_pt, D_vectorized, l ); D_vectorized += vectorized_link_offset; - in_pt = in + num_site_var*op->neighbor_table[index+Z]; - coarse_hopp_PRECISION_vectorized( out_pt, in_pt, D_vectorized, l ); + in_pt.vector_buffer = in->vector_buffer + num_site_var*op->neighbor_table[index+Z]; + coarse_hopp_PRECISION_vectorized( &out_pt, &in_pt, D_vectorized, l ); - in_pt = in + num_site_var*op->neighbor_table[index+Y]; + in_pt.vector_buffer = in->vector_buffer + num_site_var*op->neighbor_table[index+Y]; D_vectorized += vectorized_link_offset; - coarse_hopp_PRECISION_vectorized( out_pt, in_pt, D_vectorized, l ); + coarse_hopp_PRECISION_vectorized( &out_pt, &in_pt, D_vectorized, l ); - in_pt = in + num_site_var*op->neighbor_table[index+X]; + in_pt.vector_buffer = in->vector_buffer + num_site_var*op->neighbor_table[index+X]; D_vectorized += vectorized_link_offset; - coarse_hopp_PRECISION_vectorized( out_pt, in_pt, D_vectorized, l ); + coarse_hopp_PRECISION_vectorized( &out_pt, &in_pt, D_vectorized, l ); } START_LOCKED_MASTER(threading) if ( op->c.comm ) { for ( mu=0; mu<4; mu++ ) { // wait for +mu direction - ghost_wait_PRECISION( out, mu, +1, &(op->c), plus_dir_param, l ); + ghost_wait_PRECISION( out->vector_buffer, mu, +1, &(op->c), plus_dir_param, l ); } } END_LOCKED_MASTER(threading) @@ -1057,7 +1064,7 @@ void coarse_hopping_term_PRECISION_vectorized( vector_PRECISION out, vector_PREC } -void coarse_pn_hopping_term_PRECISION_vectorized( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, +void coarse_pn_hopping_term_PRECISION_vectorized( vector_PRECISION *out, vector_PRECISION *in, operator_PRECISION_struct *op, const int amount, level_struct *l, int sign, struct Thread *threading ) { #ifdef OPTIMIZED_COARSE_NEIGHBOR_COUPLING_PRECISION @@ -1067,6 +1074,8 @@ void coarse_pn_hopping_term_PRECISION_vectorized( vector_PRECISION out, vector_P start=0, num_lattice_sites=l->num_inner_lattice_sites, plus_dir_param=_FULL_SYSTEM, minus_dir_param=_FULL_SYSTEM; vector_PRECISION in_pt, out_pt; + in_pt = *in; + out_pt = *out; OPERATOR_TYPE_PRECISION *D_vectorized; int column_offset = 2*SIMD_LENGTH_PRECISION*((l->num_parent_eig_vect+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); @@ -1077,7 +1086,7 @@ void coarse_pn_hopping_term_PRECISION_vectorized( vector_PRECISION out, vector_P int core_start; int core_end; - void (*coarse_hopp)(vector_PRECISION eta, vector_PRECISION phi, OPERATOR_TYPE_PRECISION *D, level_struct *l); + void (*coarse_hopp)(vector_PRECISION *eta, vector_PRECISION *phi, OPERATOR_TYPE_PRECISION *D, level_struct *l); if(sign == +1) coarse_hopp = coarse_hopp_PRECISION_vectorized; else @@ -1098,7 +1107,7 @@ void coarse_pn_hopping_term_PRECISION_vectorized( vector_PRECISION out, vector_P START_MASTER(threading) for ( mu=0; mu<4; mu++ ) { // send in -mu direction - ghost_sendrecv_PRECISION( in, mu, -1, &(op->c), 
minus_dir_param, l ); + ghost_sendrecv_PRECISION( in->vector_buffer, mu, -1, &(op->c), minus_dir_param, l ); } END_MASTER(threading) @@ -1114,20 +1123,20 @@ void coarse_pn_hopping_term_PRECISION_vectorized( vector_PRECISION out, vector_P for(int mu=0; mu<4; mu++) { if(neighbor_fw[5*i+1+mu] < l->num_inner_lattice_sites) continue; - out_pt = out + num_site_var*neighbor_fw[5*i+1+mu]; - in_pt = in + num_site_var*neighbor_fw[5*i]; + out_pt.vector_buffer = out->vector_buffer + num_site_var*neighbor_fw[5*i+1+mu]; + in_pt.vector_buffer = in->vector_buffer + num_site_var*neighbor_fw[5*i]; D_vectorized = op->D_transformed_vectorized + 4*link_offset*neighbor_fw[5*i] + mu*link_offset; - coarse_hopp( out_pt, in_pt, D_vectorized, l ); + coarse_hopp( &out_pt, &in_pt, D_vectorized, l ); } } START_LOCKED_MASTER(threading) for ( mu=0; mu<4; mu++ ) { // send in +mu direction - ghost_sendrecv_PRECISION( out, mu, +1, &(op->c), plus_dir_param, l ); + ghost_sendrecv_PRECISION( out->vector_buffer, mu, +1, &(op->c), plus_dir_param, l ); } for ( mu=0; mu<4; mu++ ) { // wait for -mu direction - ghost_wait_PRECISION( in, mu, -1, &(op->c), minus_dir_param, l ); + ghost_wait_PRECISION( in->vector_buffer, mu, -1, &(op->c), minus_dir_param, l ); } END_LOCKED_MASTER(threading) } @@ -1145,7 +1154,7 @@ void coarse_pn_hopping_term_PRECISION_vectorized( vector_PRECISION out, vector_P // assumptions (1) self coupling has already been performed // OR (2) "out" is initialized with zeros for ( i=core_start; ivector_buffer + num_site_var*neighbor_fw[5*i]; // U_mu^dagger coupling for(int mu=0; mu<4; mu++) { @@ -1153,15 +1162,15 @@ void coarse_pn_hopping_term_PRECISION_vectorized( vector_PRECISION out, vector_P if(neighbor_bw[5*i+1+mu] >= l->num_inner_lattice_sites) continue; D_vectorized = op->D_transformed_vectorized + 4*link_offset*neighbor_bw[5*i+1+mu] + mu*link_offset; - in_pt = in + num_site_var*neighbor_bw[5*i+1+mu]; - coarse_hopp( out_pt, in_pt, D_vectorized, l ); + in_pt.vector_buffer = in->vector_buffer + num_site_var*neighbor_bw[5*i+1+mu]; + coarse_hopp( &out_pt, &in_pt, D_vectorized, l ); } // compute U_mu couplings for(int mu=0; mu<4; mu++) { D_vectorized = op->D_vectorized + 4*link_offset*neighbor_fw[5*i] + mu*link_offset; - in_pt = in + num_site_var*neighbor_fw[5*i+1+mu]; - coarse_hopp( out_pt, in_pt, D_vectorized, l ); + in_pt.vector_buffer = in->vector_buffer + num_site_var*neighbor_fw[5*i+1+mu]; + coarse_hopp( &out_pt, &in_pt, D_vectorized, l ); } } @@ -1171,7 +1180,7 @@ void coarse_pn_hopping_term_PRECISION_vectorized( vector_PRECISION out, vector_P START_LOCKED_MASTER(threading) for ( mu=0; mu<4; mu++ ) { // wait for +mu direction - ghost_wait_PRECISION( out, mu, +1, &(op->c), plus_dir_param, l ); + ghost_wait_PRECISION( out->vector_buffer, mu, +1, &(op->c), plus_dir_param, l ); } END_LOCKED_MASTER(threading) } @@ -1183,7 +1192,7 @@ void coarse_pn_hopping_term_PRECISION_vectorized( vector_PRECISION out, vector_P } -void coarse_n_hopping_term_PRECISION_vectorized( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, +void coarse_n_hopping_term_PRECISION_vectorized( vector_PRECISION *out, vector_PRECISION *in, operator_PRECISION_struct *op, const int amount, level_struct *l, struct Thread *threading ) { #ifdef OPTIMIZED_COARSE_NEIGHBOR_COUPLING_PRECISION @@ -1193,6 +1202,8 @@ void coarse_n_hopping_term_PRECISION_vectorized( vector_PRECISION out, vector_PR start=0, num_lattice_sites=l->num_inner_lattice_sites, plus_dir_param=_FULL_SYSTEM, minus_dir_param=_FULL_SYSTEM; vector_PRECISION in_pt, 
out_pt; + in_pt = *in; + out_pt = *out; OPERATOR_TYPE_PRECISION *D_vectorized; int column_offset = 2*SIMD_LENGTH_PRECISION*((l->num_parent_eig_vect+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); @@ -1217,7 +1228,7 @@ void coarse_n_hopping_term_PRECISION_vectorized( vector_PRECISION out, vector_PR if ( op->c.comm ) { for ( mu=0; mu<4; mu++ ) { // communicate in -mu direction - ghost_sendrecv_PRECISION( in, mu, -1, &(op->c), minus_dir_param, l ); + ghost_sendrecv_PRECISION( in->vector_buffer, mu, -1, &(op->c), minus_dir_param, l ); } } END_MASTER(threading) @@ -1235,49 +1246,49 @@ void coarse_n_hopping_term_PRECISION_vectorized( vector_PRECISION out, vector_PR // compute U_mu^dagger coupling for ( i=core_start; ineighbor_table[index]; + in_pt.vector_buffer = in->vector_buffer + num_site_var*op->neighbor_table[index]; D_vectorized = op->D_transformed_vectorized + 4*vectorized_link_offset*op->neighbor_table[index] + 0*vectorized_link_offset; index++; - out_pt = out + num_site_var*op->neighbor_table[index+T]; - coarse_n_hopp_PRECISION_vectorized( out_pt, in_pt, D_vectorized, l ); + out_pt.vector_buffer = out->vector_buffer + num_site_var*op->neighbor_table[index+T]; + coarse_n_hopp_PRECISION_vectorized( &out_pt, &in_pt, D_vectorized, l ); } SYNC_CORES(threading) for ( i=core_start; ineighbor_table[index]; + in_pt.vector_buffer = in->vector_buffer + num_site_var*op->neighbor_table[index]; D_vectorized = op->D_transformed_vectorized + 4*vectorized_link_offset*op->neighbor_table[index] + 1*vectorized_link_offset; index++; - out_pt = out + num_site_var*op->neighbor_table[index+Z]; - coarse_n_hopp_PRECISION_vectorized( out_pt, in_pt, D_vectorized, l ); + out_pt.vector_buffer = out->vector_buffer + num_site_var*op->neighbor_table[index+Z]; + coarse_n_hopp_PRECISION_vectorized( &out_pt, &in_pt, D_vectorized, l ); } SYNC_CORES(threading) for ( i=core_start; ineighbor_table[index]; + in_pt.vector_buffer = in->vector_buffer + num_site_var*op->neighbor_table[index]; D_vectorized = op->D_transformed_vectorized + 4*vectorized_link_offset*op->neighbor_table[index] + 2*vectorized_link_offset; index++; - out_pt = out + num_site_var*op->neighbor_table[index+Y]; - coarse_n_hopp_PRECISION_vectorized( out_pt, in_pt, D_vectorized, l ); + out_pt.vector_buffer = out->vector_buffer + num_site_var*op->neighbor_table[index+Y]; + coarse_n_hopp_PRECISION_vectorized( &out_pt, &in_pt, D_vectorized, l ); } SYNC_CORES(threading) for ( i=core_start; ineighbor_table[index]; + in_pt.vector_buffer = in->vector_buffer + num_site_var*op->neighbor_table[index]; D_vectorized = op->D_transformed_vectorized + 4*vectorized_link_offset*op->neighbor_table[index] + 3*vectorized_link_offset; index++; - out_pt = out + num_site_var*op->neighbor_table[index+X]; - coarse_n_hopp_PRECISION_vectorized( out_pt, in_pt, D_vectorized, l ); + out_pt.vector_buffer = out->vector_buffer + num_site_var*op->neighbor_table[index+X]; + coarse_n_hopp_PRECISION_vectorized( &out_pt, &in_pt, D_vectorized, l ); } START_LOCKED_MASTER(threading) if ( op->c.comm ) { for ( mu=0; mu<4; mu++ ) { // communicate in +mu direction - ghost_sendrecv_PRECISION( out, mu, +1, &(op->c), plus_dir_param, l ); + ghost_sendrecv_PRECISION( out->vector_buffer, mu, +1, &(op->c), plus_dir_param, l ); } for ( mu=0; mu<4; mu++ ) { // wait for -mu direction - ghost_wait_PRECISION( in, mu, -1, &(op->c), minus_dir_param, l ); + ghost_wait_PRECISION( in->vector_buffer, mu, -1, &(op->c), minus_dir_param, l ); } } END_LOCKED_MASTER(threading) @@ -1292,30 +1303,30 @@ void 
coarse_n_hopping_term_PRECISION_vectorized( vector_PRECISION out, vector_PR // compute U_mu couplings for ( i=core_start; ineighbor_table[index]; + out_pt.vector_buffer = out->vector_buffer + num_site_var*op->neighbor_table[index]; D_vectorized = op->D_vectorized + 4*vectorized_link_offset*op->neighbor_table[index]; index++; - in_pt = in + num_site_var*op->neighbor_table[index+T]; - coarse_n_hopp_PRECISION_vectorized( out_pt, in_pt, D_vectorized, l ); + in_pt.vector_buffer = in->vector_buffer + num_site_var*op->neighbor_table[index+T]; + coarse_n_hopp_PRECISION_vectorized( &out_pt, &in_pt, D_vectorized, l ); D_vectorized += vectorized_link_offset; - in_pt = in + num_site_var*op->neighbor_table[index+Z]; - coarse_n_hopp_PRECISION_vectorized( out_pt, in_pt, D_vectorized, l ); + in_pt.vector_buffer = in->vector_buffer + num_site_var*op->neighbor_table[index+Z]; + coarse_n_hopp_PRECISION_vectorized( &out_pt, &in_pt, D_vectorized, l ); D_vectorized += vectorized_link_offset; - in_pt = in + num_site_var*op->neighbor_table[index+Y]; - coarse_n_hopp_PRECISION_vectorized( out_pt, in_pt, D_vectorized, l ); + in_pt.vector_buffer = in->vector_buffer + num_site_var*op->neighbor_table[index+Y]; + coarse_n_hopp_PRECISION_vectorized( &out_pt, &in_pt, D_vectorized, l ); D_vectorized += vectorized_link_offset; - in_pt = in + num_site_var*op->neighbor_table[index+X]; - coarse_n_hopp_PRECISION_vectorized( out_pt, in_pt, D_vectorized, l ); + in_pt.vector_buffer = in->vector_buffer + num_site_var*op->neighbor_table[index+X]; + coarse_n_hopp_PRECISION_vectorized( &out_pt, &in_pt, D_vectorized, l ); } START_LOCKED_MASTER(threading) if ( op->c.comm ) { for ( mu=0; mu<4; mu++ ) { // wait for +mu direction - ghost_wait_PRECISION( out, mu, +1, &(op->c), plus_dir_param, l ); + ghost_wait_PRECISION( out->vector_buffer, mu, +1, &(op->c), plus_dir_param, l ); } } END_LOCKED_MASTER(threading) @@ -1329,26 +1340,26 @@ void coarse_solve_odd_even_PRECISION( gmres_PRECISION_struct *p, operator_PRECIS SYNC_CORES(threading) PROF_PRECISION_START( _SC, threading ); - coarse_diag_oo_inv_PRECISION( p->x, p->b, op, l, threading ); + coarse_diag_oo_inv_PRECISION( &p->x, &p->b, op, l, threading ); PROF_PRECISION_STOP( _SC, 0, threading ); PROF_PRECISION_START( _NC, threading ); - coarse_n_hopping_term_PRECISION( p->b, p->x, op, _EVEN_SITES, l, threading ); + coarse_n_hopping_term_PRECISION( &p->b, &p->x, op, _EVEN_SITES, l, threading ); PROF_PRECISION_STOP( _NC, 0, threading ); fgmres_PRECISION( p, l, threading ); // even to odd PROF_PRECISION_START( _NC, threading ); - coarse_n_hopping_term_PRECISION( p->b, p->x, op, _ODD_SITES, l, threading ); + coarse_n_hopping_term_PRECISION( &p->b, &p->x, op, _ODD_SITES, l, threading ); PROF_PRECISION_STOP( _NC, 1, threading ); PROF_PRECISION_START( _SC, threading ); - coarse_diag_oo_inv_PRECISION( p->x, p->b, op, l, threading ); + coarse_diag_oo_inv_PRECISION( &p->x, &p->b, op, l, threading ); PROF_PRECISION_STOP( _SC, 1, threading ); SYNC_CORES(threading) } -void coarse_apply_schur_complement_PRECISION( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { +void coarse_apply_schur_complement_PRECISION( vector_PRECISION *out, vector_PRECISION *in, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { // start and end indices for vector functions depending on thread int start; @@ -1364,16 +1375,16 @@ void coarse_apply_schur_complement_PRECISION( vector_PRECISION out, vector_PRECI coarse_diag_ee_PRECISION( out, 
in, op, l, threading ); PROF_PRECISION_STOP( _SC, 0, threading ); SYNC_CORES(threading) - vector_PRECISION_define( tmp[0], 0, start, end, l ); + vector_PRECISION_define( &tmp[0], 0, start, end, l ); SYNC_CORES(threading) PROF_PRECISION_START( _NC, threading ); - coarse_hopping_term_PRECISION( tmp[0], in, op, _ODD_SITES, l, threading ); + coarse_hopping_term_PRECISION( &tmp[0], in, op, _ODD_SITES, l, threading ); PROF_PRECISION_STOP( _NC, 0, threading ); PROF_PRECISION_START( _SC, threading ); - coarse_diag_oo_inv_PRECISION( tmp[1], tmp[0], op, l, threading ); + coarse_diag_oo_inv_PRECISION( &tmp[1], &tmp[0], op, l, threading ); PROF_PRECISION_STOP( _SC, 1, threading ); PROF_PRECISION_START( _NC, threading ); - coarse_n_hopping_term_PRECISION( out, tmp[1], op, _EVEN_SITES, l, threading ); + coarse_n_hopping_term_PRECISION( out, &tmp[1], op, _EVEN_SITES, l, threading ); PROF_PRECISION_STOP( _NC, 1, threading ); } @@ -1387,47 +1398,47 @@ void g5D_coarse_solve_odd_even_PRECISION( gmres_PRECISION_struct *p, operator_PR vector_PRECISION tmp = op->buffer[0]; SYNC_CORES(threading) - vector_PRECISION_define( tmp, 0, start_even, end_even, l ); + vector_PRECISION_define( &tmp, 0, start_even, end_even, l ); SYNC_CORES(threading) PROF_PRECISION_START( _SC, threading ); - coarse_gamma5_PRECISION( p->b, p->b, start_odd, end_odd, l ); + coarse_gamma5_PRECISION( &p->b, &p->b, start_odd, end_odd, l ); SYNC_CORES(threading) - coarse_diag_oo_inv_PRECISION( p->x, p->b, op, l, threading ); + coarse_diag_oo_inv_PRECISION( &p->x, &p->b, op, l, threading ); SYNC_CORES(threading) - coarse_gamma5_PRECISION( p->b, p->b, start_odd, end_odd, l ); + coarse_gamma5_PRECISION( &p->b, &p->b, start_odd, end_odd, l ); PROF_PRECISION_STOP( _SC, 0, threading ); PROF_PRECISION_START( _NC, threading ); - coarse_n_hopping_term_PRECISION( tmp, p->x, op, _EVEN_SITES, l, threading ); + coarse_n_hopping_term_PRECISION( &tmp, &p->x, op, _EVEN_SITES, l, threading ); PROF_PRECISION_STOP( _NC, 0, threading ); - coarse_gamma5_PRECISION( tmp, tmp, start_even, end_even, l ); + coarse_gamma5_PRECISION( &tmp, &tmp, start_even, end_even, l ); SYNC_CORES(threading) - vector_PRECISION_plus( p->b, p->b, tmp, start_even, end_even, l ); + vector_PRECISION_plus( &p->b, &p->b, &tmp, start_even, end_even, l ); fgmres_PRECISION( p, l, threading ); SYNC_CORES(threading) - coarse_gamma5_PRECISION( p->b, p->b, start_odd, end_odd, l ); + coarse_gamma5_PRECISION( &p->b, &p->b, start_odd, end_odd, l ); SYNC_CORES(threading) - coarse_diag_oo_inv_PRECISION( p->x, p->b, op, l, threading ); + coarse_diag_oo_inv_PRECISION( &p->x, &p->b, op, l, threading ); SYNC_CORES(threading) // even to odd PROF_PRECISION_START( _NC, threading ); - vector_PRECISION_define( tmp, 0, start_odd, end_odd, l ); + vector_PRECISION_define( &tmp, 0, start_odd, end_odd, l ); SYNC_CORES(threading) - coarse_n_hopping_term_PRECISION( tmp, p->x, op, _ODD_SITES, l, threading ); + coarse_n_hopping_term_PRECISION( &tmp, &p->x, op, _ODD_SITES, l, threading ); PROF_PRECISION_STOP( _NC, 1, threading ); SYNC_CORES(threading) PROF_PRECISION_START( _SC, threading ); - coarse_diag_oo_inv_PRECISION( p->b, tmp, op, l, threading ); - vector_PRECISION_plus( p->x, p->x, p->b, start_odd, end_odd, l ); + coarse_diag_oo_inv_PRECISION( &p->b, &tmp, op, l, threading ); + vector_PRECISION_plus( &p->x, &p->x, &p->b, start_odd, end_odd, l ); PROF_PRECISION_STOP( _SC, 1, threading ); SYNC_CORES(threading) } -void g5D_coarse_apply_schur_complement_PRECISION( vector_PRECISION out, vector_PRECISION in, 
operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { +void g5D_coarse_apply_schur_complement_PRECISION( vector_PRECISION *out, vector_PRECISION *in, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { int start_even, end_even, start_odd, end_odd; compute_core_start_end_custom(0, op->num_even_sites*l->num_lattice_site_var, &start_even, &end_even, l, threading, l->num_lattice_site_var ); @@ -1440,16 +1451,16 @@ void g5D_coarse_apply_schur_complement_PRECISION( vector_PRECISION out, vector_P coarse_diag_ee_PRECISION( out, in, op, l, threading ); PROF_PRECISION_STOP( _SC, 0, threading ); SYNC_CORES(threading) - vector_PRECISION_define( tmp[0], 0, start_odd, end_odd, l ); + vector_PRECISION_define( &tmp[0], 0, start_odd, end_odd, l ); SYNC_CORES(threading) PROF_PRECISION_START( _NC, threading ); - coarse_hopping_term_PRECISION( tmp[0], in, op, _ODD_SITES, l, threading ); + coarse_hopping_term_PRECISION( &tmp[0], in, op, _ODD_SITES, l, threading ); PROF_PRECISION_STOP( _NC, 0, threading ); PROF_PRECISION_START( _SC, threading ); - coarse_diag_oo_inv_PRECISION( tmp[1], tmp[0], op, l, threading ); + coarse_diag_oo_inv_PRECISION( &tmp[1], &tmp[0], op, l, threading ); PROF_PRECISION_STOP( _SC, 1, threading ); PROF_PRECISION_START( _NC, threading ); - coarse_n_hopping_term_PRECISION( out, tmp[1], op, _EVEN_SITES, l, threading ); + coarse_n_hopping_term_PRECISION( out, &tmp[1], op, _EVEN_SITES, l, threading ); PROF_PRECISION_STOP( _NC, 1, threading ); SYNC_CORES(threading) coarse_gamma5_PRECISION( out, out, start_even, end_even, l ); @@ -1457,52 +1468,55 @@ void g5D_coarse_apply_schur_complement_PRECISION( vector_PRECISION out, vector_P } -void coarse_odd_even_PRECISION_test( vector_PRECISION out, vector_PRECISION in, level_struct *l, struct Thread *threading ) { +void coarse_odd_even_PRECISION_test( vector_PRECISION *out, vector_PRECISION *in, level_struct *l, struct Thread *threading ) { if ( g.odd_even ) { - vector_PRECISION buf1 = NULL, buf2 = NULL; + vector_PRECISION buf1, buf2; - PUBLIC_MALLOC( buf1, complex_PRECISION, 2*l->vector_size ); - buf2 = buf1 + l->vector_size; + vector_PRECISION_init(&buf1); + vector_PRECISION_init(&buf2); + + PUBLIC_MALLOC( buf1.vector_buffer, complex_PRECISION, 2*l->vector_size ); + buf2.vector_buffer = buf1.vector_buffer + l->vector_size; START_LOCKED_MASTER(threading) // transformation part - vector_PRECISION_copy( buf1, in, 0, l->inner_vector_size, l ); + vector_PRECISION_copy( &buf1, in, 0, l->inner_vector_size, l ); // even to odd vector_PRECISION_define( out, 0, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l->inner_vector_size, l ); END_LOCKED_MASTER(threading) - coarse_hopping_term_PRECISION( out, buf1, &(l->oe_op_PRECISION), _ODD_SITES, l, threading ); - coarse_diag_oo_inv_PRECISION( buf2, out, &(l->oe_op_PRECISION), l, threading ); + coarse_hopping_term_PRECISION( out, &buf1, &(l->oe_op_PRECISION), _ODD_SITES, l, threading ); + coarse_diag_oo_inv_PRECISION( &buf2, out, &(l->oe_op_PRECISION), l, threading ); START_LOCKED_MASTER(threading) - vector_PRECISION_plus( buf1, buf1, buf2, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l->inner_vector_size, l ); + vector_PRECISION_plus( &buf1, &buf1, &buf2, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l->inner_vector_size, l ); END_LOCKED_MASTER(threading) // block diagonal part if ( g.method == 6 ) { - g5D_coarse_apply_schur_complement_PRECISION( out, buf1, &(l->oe_op_PRECISION), l, threading ); + 
g5D_coarse_apply_schur_complement_PRECISION( out, &buf1, &(l->oe_op_PRECISION), l, threading ); } else { - coarse_apply_schur_complement_PRECISION( out, buf1, &(l->oe_op_PRECISION), l, threading ); + coarse_apply_schur_complement_PRECISION( out, &buf1, &(l->oe_op_PRECISION), l, threading ); } - coarse_diag_oo_PRECISION( out, buf1, &(l->oe_op_PRECISION), l, threading ); + coarse_diag_oo_PRECISION( out, &buf1, &(l->oe_op_PRECISION), l, threading ); // back transformation part - coarse_diag_oo_inv_PRECISION( buf2, out, &(l->oe_op_PRECISION), l, threading ); + coarse_diag_oo_inv_PRECISION( &buf2, out, &(l->oe_op_PRECISION), l, threading ); if ( g.method == 6 ) { START_LOCKED_MASTER(threading) coarse_gamma5_PRECISION( out, out, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l->inner_vector_size, l ); - vector_PRECISION_define( buf1, 0, 0, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l ); - coarse_hopping_term_PRECISION( buf1, buf2, &(l->oe_op_PRECISION), _EVEN_SITES, l, no_threading ); - coarse_gamma5_PRECISION( buf1, buf1, 0, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l ); - vector_PRECISION_plus( out, out, buf1, 0, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l ); + vector_PRECISION_define( &buf1, 0, 0, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l ); + coarse_hopping_term_PRECISION( &buf1, &buf2, &(l->oe_op_PRECISION), _EVEN_SITES, l, no_threading ); + coarse_gamma5_PRECISION( &buf1, &buf1, 0, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l ); + vector_PRECISION_plus( out, out, &buf1, 0, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l ); END_LOCKED_MASTER(threading) } else { - coarse_hopping_term_PRECISION( out, buf2, &(l->oe_op_PRECISION), _EVEN_SITES, l, threading ); + coarse_hopping_term_PRECISION( out, &buf2, &(l->oe_op_PRECISION), _EVEN_SITES, l, threading ); } - PUBLIC_FREE( buf1, complex_PRECISION, 2*l->vector_size ); + PUBLIC_FREE( buf1.vector_buffer, complex_PRECISION, 2*l->vector_size ); } } diff --git a/src/coarse_oddeven_generic.h b/src/coarse_oddeven_generic.h index e1481be..85f74c6 100644 --- a/src/coarse_oddeven_generic.h +++ b/src/coarse_oddeven_generic.h @@ -34,30 +34,30 @@ void coarse_solve_odd_even_PRECISION( gmres_PRECISION_struct *p, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); - void coarse_apply_schur_complement_PRECISION( vector_PRECISION out, vector_PRECISION in, + void coarse_apply_schur_complement_PRECISION( vector_PRECISION *out, vector_PRECISION *in, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); void g5D_coarse_solve_odd_even_PRECISION( gmres_PRECISION_struct *p, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); - void g5D_coarse_apply_schur_complement_PRECISION( vector_PRECISION out, vector_PRECISION in, + void g5D_coarse_apply_schur_complement_PRECISION( vector_PRECISION *out, vector_PRECISION *in, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); - void coarse_hopping_term_PRECISION( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, + void coarse_hopping_term_PRECISION( vector_PRECISION *out, vector_PRECISION *in, operator_PRECISION_struct *op, const int amount, level_struct *l, struct Thread *threading ); - void coarse_n_hopping_term_PRECISION( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, + void coarse_n_hopping_term_PRECISION( vector_PRECISION *out, vector_PRECISION *in, operator_PRECISION_struct 
*op, const int amount, level_struct *l, struct Thread *threading ); - void coarse_hopping_term_PRECISION_vectorized( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, + void coarse_hopping_term_PRECISION_vectorized( vector_PRECISION *out, vector_PRECISION *in, operator_PRECISION_struct *op, const int amount, level_struct *l, struct Thread *threading ); - void coarse_pn_hopping_term_PRECISION_vectorized( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, + void coarse_pn_hopping_term_PRECISION_vectorized( vector_PRECISION *out, vector_PRECISION *in, operator_PRECISION_struct *op, const int amount, level_struct *l, int sign, struct Thread *threading ); - void coarse_n_hopping_term_PRECISION_vectorized( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, + void coarse_n_hopping_term_PRECISION_vectorized( vector_PRECISION *out, vector_PRECISION *in, operator_PRECISION_struct *op, const int amount, level_struct *l, struct Thread *threading ); - void coarse_odd_even_PRECISION_test( vector_PRECISION c4, vector_PRECISION c1, + void coarse_odd_even_PRECISION_test( vector_PRECISION *c4, vector_PRECISION *c1, level_struct *l, struct Thread *threading ); - void coarse_diag_ee_PRECISION( vector_PRECISION y, vector_PRECISION x, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); + void coarse_diag_ee_PRECISION( vector_PRECISION *y, vector_PRECISION *x, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); - void coarse_diag_oo_PRECISION( vector_PRECISION y, vector_PRECISION x, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); + void coarse_diag_oo_PRECISION( vector_PRECISION *y, vector_PRECISION *x, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); #endif diff --git a/src/coarse_operator_generic.c b/src/coarse_operator_generic.c index 33338d3..b7f8c53 100644 --- a/src/coarse_operator_generic.c +++ b/src/coarse_operator_generic.c @@ -70,7 +70,8 @@ void coarse_operator_PRECISION_setup( vector_PRECISION *V, level_struct *l ) { double t0, t1; t0 = MPI_Wtime(); - vector_PRECISION buffer1 = l->vbuf_PRECISION[4], buffer2 = l->vbuf_PRECISION[5]; + vector_PRECISION buffer1, buffer2; + buffer1.vector_buffer = l->vbuf_PRECISION[4].vector_buffer; buffer2.vector_buffer = l->vbuf_PRECISION[5].vector_buffer; int mu, n = l->num_eig_vect, i, j, D_size = l->next_level->D_size, @@ -93,22 +94,22 @@ void coarse_operator_PRECISION_setup( vector_PRECISION *V, level_struct *l ) { for ( i=0; i<n; i++ ) { for ( mu=0; mu<4; mu++ ) { // updating ghostcells of V[i] - negative_sendrecv_PRECISION( V[i], mu, &(l->s_PRECISION.op.c), l ); + negative_sendrecv_PRECISION( &V[i], mu, &(l->s_PRECISION.op.c), l ); } // apply self coupling of block-and-2spin-restricted dirac operator for each aggregate - aggregate_self_coupling( buffer1, buffer2, V[i], &(l->s_PRECISION), l ); + aggregate_self_coupling( &buffer1, &buffer2, &V[i], &(l->s_PRECISION), l ); // calculate selfcoupling entries of the coarse grid operator - set_coarse_self_coupling_PRECISION( buffer1, buffer2, V, i, l ); + set_coarse_self_coupling_PRECISION( &buffer1, &buffer2, V, i, l ); //odd_proj - aggregate_block( buffer1, buffer2, V[i], l->s_PRECISION.op.odd_proj, l ); - set_block_diagonal_PRECISION( buffer1, buffer2, V, i, l->next_level->op_PRECISION.odd_proj, l ); + aggregate_block( &buffer1, &buffer2, &V[i], l->s_PRECISION.op.odd_proj, l ); + set_block_diagonal_PRECISION( &buffer1, &buffer2, V, i, l->next_level->op_PRECISION.odd_proj, l ); for ( mu=0; mu<4; mu++ ) { // finish updating ghostcells of V[i] negative_wait_PRECISION( 
mu, &(l->s_PRECISION.op.c), l ); // apply 2spin-restricted dirac operator for direction mu for all aggregates - aggregate_neighbor_coupling( buffer1, buffer2, V[i], mu, &(l->s_PRECISION), l ); - set_coarse_neighbor_coupling_PRECISION( buffer1, buffer2, V, mu, i, l ); + aggregate_neighbor_coupling( &buffer1, &buffer2, &V[i], mu, &(l->s_PRECISION), l ); + set_coarse_neighbor_coupling_PRECISION( &buffer1, &buffer2, V, mu, i, l ); } } @@ -129,7 +130,7 @@ void coarse_operator_PRECISION_setup_finalize( level_struct *l, struct Thread *t PRECISION mf = (g.mu_factor[l->depth]) ? g.mu_factor[l->next_level->depth]/g.mu_factor[l->depth]:0; if ( mf*l->s_PRECISION.op.mu + mf*l->s_PRECISION.op.mu_even_shift == 0 && mf*l->s_PRECISION.op.mu + mf*l->s_PRECISION.op.mu_odd_shift == 0 ) - vector_PRECISION_define( l->next_level->op_PRECISION.tm_term, _COMPLEX_double_ZERO, 0, block_size, l->next_level ); + buffer_PRECISION_define( l->next_level->op_PRECISION.tm_term, _COMPLEX_double_ZERO, 0, block_size, l->next_level ); else tm_term_PRECISION_setup( mf*l->s_PRECISION.op.mu, mf*l->s_PRECISION.op.mu_even_shift, mf*l->s_PRECISION.op.mu_odd_shift, &(l->next_level->op_PRECISION), @@ -140,7 +141,7 @@ void coarse_operator_PRECISION_setup_finalize( level_struct *l, struct Thread *t PRECISION ef = (g.epsbar_factor[l->depth]) ? g.epsbar_factor[l->next_level->depth]/g.epsbar_factor[l->depth]:0; if ( ef*l->s_PRECISION.op.epsbar == 0 && ef*l->s_PRECISION.op.epsbar_ig5_even_shift == 0 && ef*l->s_PRECISION.op.epsbar_ig5_odd_shift == 0 ) - vector_PRECISION_define( l->next_level->op_PRECISION.epsbar_term, _COMPLEX_double_ZERO, 0, block_size, l->next_level ); + buffer_PRECISION_define( l->next_level->op_PRECISION.epsbar_term, _COMPLEX_double_ZERO, 0, block_size, l->next_level ); else epsbar_term_PRECISION_setup( ef*l->s_PRECISION.op.epsbar, ef*l->s_PRECISION.op.epsbar_ig5_even_shift, ef*l->s_PRECISION.op.epsbar_ig5_odd_shift, &(l->next_level->op_PRECISION), @@ -149,7 +150,7 @@ void coarse_operator_PRECISION_setup_finalize( level_struct *l, struct Thread *t } -void set_block_diagonal_PRECISION( vector_PRECISION spin_0_1, vector_PRECISION spin_2_3, +void set_block_diagonal_PRECISION( vector_PRECISION *spin_0_1, vector_PRECISION *spin_2_3, vector_PRECISION *V, const int n, config_PRECISION block, level_struct *l ) { // U(x) = [ A 0 , A=A*, D=D* @@ -162,16 +163,16 @@ void set_block_diagonal_PRECISION( vector_PRECISION spin_0_1, vector_PRECISION s aggregate_size = l->num_inner_lattice_sites*l->num_parent_eig_vect*2/num_aggregates, offset = l->num_parent_eig_vect, block_site_size = (num_eig_vect*(num_eig_vect+1)); - vector_PRECISION spin_0_1_pt, spin_2_3_pt, interpolation_data; + buffer_PRECISION spin_0_1_pt, spin_2_3_pt, interpolation_data; config_PRECISION block_pt; for ( k=0; k<=n; k++ ) { k1 = (n*(n+1))/2+k; k2 = (n*(n+1))/2+k+block_site_size/2; for ( j=0; jvector_buffer + j*aggregate_size; + spin_2_3_pt = spin_2_3->vector_buffer + j*aggregate_size; + interpolation_data = V[k].vector_buffer + j*aggregate_size; block_pt = block + j*block_site_size; for ( i=0; iis_PRECISION.num_agg, @@ -194,7 +195,7 @@ void set_coarse_self_coupling_PRECISION( vector_PRECISION spin_0_1, vector_PRECI aggregate_size = l->num_inner_lattice_sites*l->num_parent_eig_vect*2/num_aggregates, offset = l->num_parent_eig_vect, clover_site_size = (num_eig_vect*(2*num_eig_vect+1)); - vector_PRECISION spin_0_1_pt, spin_2_3_pt, interpolation_data; + buffer_PRECISION spin_0_1_pt, spin_2_3_pt, interpolation_data; config_PRECISION clover_pt, clover = 
l->next_level->op_PRECISION.clover; // U(x) = [ A B , A=A*, D=D*, C = -B* @@ -205,9 +206,9 @@ void set_coarse_self_coupling_PRECISION( vector_PRECISION spin_0_1, vector_PRECI k1 = (n*(n+1))/2+k; k2 = (n*(n+1))/2+k+(num_eig_vect*(num_eig_vect+1))/2; for ( j=0; jvector_buffer + j*aggregate_size; + spin_2_3_pt = spin_2_3->vector_buffer + j*aggregate_size; + interpolation_data = V[k].vector_buffer + j*aggregate_size; clover_pt = clover + j*clover_site_size; for ( i=0; ivector_buffer + j*aggregate_size; + spin_2_3_pt = spin_2_3->vector_buffer + j*aggregate_size; + interpolation_data = V[k].vector_buffer + j*aggregate_size; clover_pt = clover + j*clover_site_size; for ( i=0; iis_PRECISION.num_agg, @@ -250,7 +251,7 @@ void set_coarse_neighbor_coupling_PRECISION( vector_PRECISION spin_0_1, vector_P D_link_size = num_eig_vect*num_eig_vect*4, *index_dir = l->is_PRECISION.agg_boundary_index[mu], aggregate_boundary_sites = l->is_PRECISION.agg_boundary_length[mu]/num_aggregates; - vector_PRECISION spin_0_1_pt, spin_2_3_pt, interpolation_data; + buffer_PRECISION spin_0_1_pt, spin_2_3_pt, interpolation_data; config_PRECISION D_pt, D = l->next_level->op_PRECISION.D; // U_mu(x) = [ A B , U_-mu(x+muhat) = [ A* -C* @@ -264,8 +265,8 @@ void set_coarse_neighbor_coupling_PRECISION( vector_PRECISION spin_0_1, vector_P D_pt = D+(j*4+mu)*D_link_size; for ( i=0; ivector_buffer + nlsv*index_dir[i1]; + interpolation_data = V[k].vector_buffer + nlsv*index_dir[i1]; i1++; // A for ( m=0; mvector_buffer + nlsv*index_dir[i1]; + interpolation_data = V[k].vector_buffer + nlsv*index_dir[i1]; i1++; // B for ( m=0; mnum_block_sites, *length = s->dir_length, **index = s->index, *ind, *neighbor = s->op.neighbor_table, m = l->num_lattice_site_var, num_eig_vect = l->num_parent_eig_vect; - vector_PRECISION lphi = phi+start, leta = eta+start; - + vector_PRECISION lphi, leta; + lphi.vector_buffer = phi->vector_buffer+start; leta.vector_buffer = eta->vector_buffer+start; + vector_PRECISION leta1=leta, leta2=leta, lphi1=lphi, lphi2=lphi; + // site-wise self coupling #ifndef OPTIMIZED_COARSE_SELF_COUPLING_PRECISION coarse_self_couplings_PRECISION( eta, phi, &(s->op), (start/m), (start/m)+n, l); @@ -320,8 +323,13 @@ void coarse_block_operator_PRECISION( vector_PRECISION eta, vector_PRECISION phi for ( int i=0; iop.neighbor_table, @@ -363,16 +373,16 @@ void coarse_aggregate_self_couplings_PRECISION( vector_PRECISION eta1, vector_PR length = l->is_PRECISION.agg_length[mu]; index_dir = l->is_PRECISION.agg_index[mu]; for ( i=0; ivector_buffer + n*index2; eta1_pt.vector_buffer = eta1->vector_buffer + n*index1; eta2_pt.vector_buffer = eta2->vector_buffer + n*index1; + coarse_spinwise_n_hopp_PRECISION( &eta1_pt, &eta2_pt, &phi_pt, D_pt, l ); + phi_pt.vector_buffer = phi->vector_buffer + n*index1; eta1_pt.vector_buffer = eta1->vector_buffer + n*index2; eta2_pt.vector_buffer = eta2->vector_buffer + n*index2; + coarse_spinwise_n_daggered_hopp_PRECISION( &eta1_pt, &eta2_pt, &phi_pt, D_pt, l ); } } } -void coarse_aggregate_neighbor_couplings_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, vector_PRECISION phi, +void coarse_aggregate_neighbor_couplings_PRECISION( vector_PRECISION *eta1, vector_PRECISION *eta2, vector_PRECISION *phi, const int mu, schwarz_PRECISION_struct *s, level_struct *l ) { int i, index1, index2, length = l->is_PRECISION.agg_boundary_length[mu], @@ -390,12 +400,12 @@ void coarse_aggregate_neighbor_couplings_PRECISION( vector_PRECISION eta1, vecto index1 = index_dir[i]; index2 = neighbor[i]; D_pt = D + Dss*index1 + 
Dls*mu; - phi_pt = phi + n*index2; eta1_pt = eta1 + n*index1; eta2_pt = eta2 + n*index1; - coarse_spinwise_hopp_PRECISION( eta1_pt, eta2_pt, phi_pt, D_pt, l ); + phi_pt.vector_buffer = phi->vector_buffer + n*index2; eta1_pt.vector_buffer = eta1->vector_buffer + n*index1; eta2_pt.vector_buffer = eta2->vector_buffer + n*index1; + coarse_spinwise_hopp_PRECISION( &eta1_pt, &eta2_pt, &phi_pt, D_pt, l ); } } -void coarse_self_couplings_PRECISION( vector_PRECISION eta, vector_PRECISION phi, +void coarse_self_couplings_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, operator_PRECISION_struct *op, int start, int end, level_struct *l ) { int num_eig_vect = l->num_parent_eig_vect, @@ -419,39 +429,40 @@ void coarse_self_couplings_PRECISION( vector_PRECISION eta, vector_PRECISION phi } -void coarse_aggregate_block_diagonal_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, vector_PRECISION phi, +void coarse_aggregate_block_diagonal_PRECISION( vector_PRECISION *eta1, vector_PRECISION *eta2, vector_PRECISION *phi, config_PRECISION block, level_struct *l ) { int length = l->inner_vector_size, num_eig_vect = l->num_parent_eig_vect, block_step_size = (num_eig_vect * (num_eig_vect+1))/2; config_PRECISION block_pt = block; - vector_PRECISION phi_pt=phi, eta1_pt=eta1, eta2_pt=eta2, phi_end_pt=phi+length; + vector_PRECISION phi_pt=*phi, eta1_pt=*eta1, eta2_pt=*eta2, phi_end_pt; + phi_end_pt.vector_buffer=phi->vector_buffer+length; // U(x) = [ A 0 , A=A*, D=D* // 0 D ] // storage order: upper triangle of A, upper triangle of D, columnwise // diagonal coupling - while ( phi_pt < phi_end_pt ) { + while ( phi_pt.vector_buffer< phi_end_pt.vector_buffer ) { // A - mvp_PRECISION( eta1_pt, block_pt, phi_pt, num_eig_vect ); - vector_PRECISION_define( eta2_pt, _COMPLEX_PRECISION_ZERO, 0, num_eig_vect, l ); - block_pt += block_step_size; eta1_pt += num_eig_vect; eta2_pt += num_eig_vect; phi_pt += num_eig_vect; + mvp_PRECISION( eta1_pt.vector_buffer, block_pt, phi_pt.vector_buffer, num_eig_vect ); + vector_PRECISION_define( &eta2_pt, _COMPLEX_PRECISION_ZERO, 0, num_eig_vect, l ); + block_pt += block_step_size; eta1_pt.vector_buffer += num_eig_vect; eta2_pt.vector_buffer += num_eig_vect; phi_pt.vector_buffer += num_eig_vect; // D - vector_PRECISION_define( eta1_pt, _COMPLEX_PRECISION_ZERO, 0, num_eig_vect, l ); - mvp_PRECISION( eta2_pt, block_pt, phi_pt, num_eig_vect ); - block_pt += block_step_size; eta1_pt += num_eig_vect; eta2_pt += num_eig_vect; phi_pt += num_eig_vect; + vector_PRECISION_define( &eta1_pt, _COMPLEX_PRECISION_ZERO, 0, num_eig_vect, l ); + mvp_PRECISION( eta2_pt.vector_buffer, block_pt, phi_pt.vector_buffer, num_eig_vect ); + block_pt += block_step_size; eta1_pt.vector_buffer += num_eig_vect; eta2_pt.vector_buffer += num_eig_vect; phi_pt.vector_buffer += num_eig_vect; } } -void coarse_spinwise_self_couplings_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, vector_PRECISION phi, +void coarse_spinwise_self_couplings_PRECISION( vector_PRECISION *eta1, vector_PRECISION *eta2, vector_PRECISION *phi, config_PRECISION clover, int length, level_struct *l ) { int num_eig_vect = l->num_parent_eig_vect, clover_step_size1 = (num_eig_vect * (num_eig_vect+1))/2, clover_step_size2 = SQUARE(num_eig_vect); config_PRECISION clover_pt = clover; - vector_PRECISION phi_pt=phi, eta1_pt=eta1, eta2_pt=eta2+num_eig_vect, phi_end_pt=phi+length; + buffer_PRECISION phi_pt=phi->vector_buffer, eta1_pt=eta1->vector_buffer, eta2_pt=eta2->vector_buffer+num_eig_vect, phi_end_pt=phi->vector_buffer+length; // U(x) = [ 
A B , A=A*, D=D*, C = -B* // C D ] // storage order: upper triangle of A, upper triangle of D, B, columnwise @@ -593,71 +604,71 @@ void coarse_operator_PRECISION_set_self_couplings( operator_PRECISION_struct *op #endif } -void coarse_gamma5_PRECISION( vector_PRECISION eta, vector_PRECISION phi, int start, int end, level_struct *l ) { +void coarse_gamma5_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, int start, int end, level_struct *l ) { int j, k=l->num_lattice_site_var/2; vector_PRECISION eta_end; - eta_end = eta+end; - phi += start; - eta += start; + eta_end.vector_buffer = eta->vector_buffer+end; + phi->vector_buffer += start; + eta->vector_buffer += start; - if ( eta != phi ) { - while ( eta < eta_end ) { + if ( eta->vector_buffer != phi->vector_buffer ) { + while ( eta->vector_buffer < eta_end.vector_buffer ) { for ( j=0; j<k; j++ ) { - *eta = -(*phi); - eta++; phi++; + *eta->vector_buffer = -(*phi->vector_buffer); + eta->vector_buffer++; phi->vector_buffer++; } for ( j=0; j<k; j++ ) { - *eta = *phi; - eta++; phi++; + *eta->vector_buffer = *phi->vector_buffer; + eta->vector_buffer++; phi->vector_buffer++; } } } else { - while ( eta < eta_end ) { + while ( eta->vector_buffer < eta_end.vector_buffer ) { for ( j=0; j<k; j++ ) { - *eta = -(*eta); - eta++; + *eta->vector_buffer = -(*eta->vector_buffer); + eta->vector_buffer++; } - eta+=k; + eta->vector_buffer+=k; } } } -void coarse_tau1_gamma5_PRECISION( vector_PRECISION eta, vector_PRECISION phi, int start, int end, level_struct *l ) { +void coarse_tau1_gamma5_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, int start, int end, level_struct *l ) { #ifdef HAVE_TM1p1 if ( g.n_flavours == 2 ) { int j, k=l->num_lattice_site_var/4; vector_PRECISION eta_end; - eta_end = eta+end; - phi += start; - eta += start; + eta_end.vector_buffer = eta->vector_buffer+end; + phi->vector_buffer += start; + eta->vector_buffer += start; - ASSERT( eta != phi ); - while ( eta < eta_end ) { - phi += k; + ASSERT( eta->vector_buffer != phi->vector_buffer ); + while ( eta->vector_buffer < eta_end.vector_buffer ) { + phi->vector_buffer += k; for ( j=0; j<k; j++ ) { - *eta = -(*phi); - eta++; phi++; + *eta->vector_buffer = -(*phi->vector_buffer); + eta->vector_buffer++; phi->vector_buffer++; } - phi -= 2*k; + phi->vector_buffer -= 2*k; for ( j=0; j<k; j++ ) { - *eta = -(*phi); - eta++; phi++; + *eta->vector_buffer = -(*phi->vector_buffer); + eta->vector_buffer++; phi->vector_buffer++; } - phi += 2*k; + phi->vector_buffer += 2*k; for ( j=0; j<k; j++ ) { - *eta = *phi; - eta++; phi++; + *eta->vector_buffer = *phi->vector_buffer; + eta->vector_buffer++; phi->vector_buffer++; } - phi -= 2*k; + phi->vector_buffer -= 2*k; for ( j=0; j<k; j++ ) { - *eta = *phi; - eta++; phi++; + *eta->vector_buffer = *phi->vector_buffer; + eta->vector_buffer++; phi->vector_buffer++; } - phi += k; + phi->vector_buffer += k; } } else #endif @@ -667,7 +678,7 @@ void coarse_tau1_gamma5_PRECISION( vector_PRECISION eta, vector_PRECISION phi, i } } -void apply_coarse_operator_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operator_PRECISION_struct *op, +void apply_coarse_operator_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { PROF_PRECISION_START( _SC, threading ); @@ -693,7 +704,7 @@ void apply_coarse_operator_PRECISION( vector_PRECISION eta, vector_PRECISION phi PROF_PRECISION_STOP( _NC, 1, threading ); } -void g5D_apply_coarse_operator_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operator_PRECISION_struct *op, +void g5D_apply_coarse_operator_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { int start, end; compute_core_start_end_custom(0, l->inner_vector_size, &start, &end, l, threading, l->num_lattice_site_var ); @@ -704,11 +715,11 @@ void 
g5D_apply_coarse_operator_PRECISION( vector_PRECISION eta, vector_PRECISION } -void apply_coarse_operator_dagger_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operator_PRECISION_struct *op, +void apply_coarse_operator_dagger_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { - coarse_gamma5_PRECISION( l->vbuf_PRECISION[3], phi, threading->start_index[l->depth], threading->end_index[l->depth], l ); - apply_coarse_operator_PRECISION( eta, l->vbuf_PRECISION[3], op, l, threading ); + coarse_gamma5_PRECISION( &(l->vbuf_PRECISION[3]), phi, threading->start_index[l->depth], threading->end_index[l->depth], l ); + apply_coarse_operator_PRECISION( eta, &(l->vbuf_PRECISION[3]), op, l, threading ); coarse_gamma5_PRECISION( eta, eta, threading->start_index[l->depth], threading->end_index[l->depth], l ); } @@ -719,14 +730,15 @@ void coarse_operator_PRECISION_test_routine( level_struct *l, struct Thread *thr int vs = l->vector_size, ivs = l->inner_vector_size, cvs = l->next_level->vector_size, civs = l->next_level->inner_vector_size; PRECISION diff = 0; - vector_PRECISION vp1=NULL, vp2, vp3, vp4, vc1=NULL, vc2, vc3; + vector_PRECISION vp1_s, vp2, vp3, vp4, vc1_s, vc2, vc3; + vector_PRECISION *vp1=&vp1_s, *vc1=&vc1_s; // back vp1/vc1 with local structs so PUBLIC_MALLOC below has a valid target instead of dereferencing NULL + vector_PRECISION_init( vp1 ); vector_PRECISION_init( vc1 ); - PUBLIC_MALLOC( vp1, complex_PRECISION, 4*vs ); - PUBLIC_MALLOC( vc1, complex_PRECISION, 3*cvs ); + PUBLIC_MALLOC( vp1->vector_buffer, complex_PRECISION, 4*vs ); + PUBLIC_MALLOC( vc1->vector_buffer, complex_PRECISION, 3*cvs ); SYNC_MASTER_TO_ALL(threading) - vp2 = vp1 + vs; vp3 = vp2 + vs; vp4 = vp3 + vs; vc2 = vc1 + cvs; vc3 = vc2 + cvs; + vp2.vector_buffer = vp1->vector_buffer + vs; vp3.vector_buffer = vp2.vector_buffer + vs; vp4.vector_buffer = vp3.vector_buffer + vs; + vc2.vector_buffer = vc1->vector_buffer + cvs; vc3.vector_buffer = vc2.vector_buffer + cvs; START_LOCKED_MASTER(threading) #ifdef HAVE_TM1p1 @@ -744,40 +756,40 @@ void coarse_operator_PRECISION_test_routine( level_struct *l, struct Thread *thr dot += (op[2*i*SIMD_LENGTH_PRECISION+0] + I*op[2*i*SIMD_LENGTH_PRECISION+SIMD_LENGTH_PRECISION])*conj(op2[2*i*SIMD_LENGTH_PRECISION+0] + I*op2[2*i*SIMD_LENGTH_PRECISION+SIMD_LENGTH_PRECISION]); diff = dot/norm; #else - diff = global_inner_product_PRECISION( l->is_PRECISION.interpolation[0], l->is_PRECISION.interpolation[1], 0, ivs, l, no_threading ) - / global_norm_PRECISION( l->is_PRECISION.interpolation[0], 0, ivs, l, no_threading ); + diff = global_inner_product_PRECISION( &(l->is_PRECISION.interpolation[0]), &(l->is_PRECISION.interpolation[1]), 0, ivs, l, no_threading ) + / global_norm_PRECISION( &(l->is_PRECISION.interpolation[0]), 0, ivs, l, no_threading ); #endif test0_PRECISION("depth: %d, correctness of block_gram_schmidt: %le\n", l->depth, cabs(diff) ); } if ( !l->next_level->idle ) vector_PRECISION_define_random( vc1, 0, civs, l->next_level ); - vector_PRECISION_distribute( vc2, vc1, l->next_level ); - vector_PRECISION_gather( vc3, vc2, l->next_level ); + vector_PRECISION_distribute( &vc2, vc1, l->next_level ); + vector_PRECISION_gather( &vc3, &vc2, l->next_level ); if ( !l->next_level->idle ) { - vector_PRECISION_minus( vc2, vc1, vc3, 0, civs, l->next_level ); - diff = global_norm_PRECISION( vc2, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( vc1, 0, civs, l->next_level, no_threading ); + vector_PRECISION_minus( &vc2, vc1, &vc3, 0, civs, l->next_level ); + diff = global_norm_PRECISION( &vc2, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( vc1, 0, civs, l->next_level, no_threading ); }
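  /* A minimal sketch of the vector type this WIP series appears to introduce in
     src/vector_generic.h, inferred from the vector_buffer accesses and the
     vector_PRECISION_init()/PUBLIC_MALLOC() pattern used in the hunks above.
     The single-member layout and the init semantics are assumptions; the real
     struct may carry further fields (size, distribution, owning level).

         typedef complex_PRECISION *buffer_PRECISION;

         typedef struct {
           buffer_PRECISION vector_buffer;  // raw storage; before this patch, this pointer *was* the vector
         } vector_PRECISION;

         static inline void vector_PRECISION_init( vector_PRECISION *v ) {
           v->vector_buffer = NULL;         // unallocated, as PUBLIC_MALLOC expects
         }

     With this layout, code that used to advance a bare pointer ( v += n ) now
     advances the member instead ( v.vector_buffer += n ), which is exactly the
     rewrite applied throughout these hunks. */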
test0_PRECISION("depth: %d, correctness of gather( distribute( phi_c ) ) : %le\n", l->depth, diff ); if ( !l->next_level->idle ) vector_PRECISION_define_random( vc1, 0, civs, l->next_level ); interpolate3_PRECISION( vp1, vc1, l, no_threading ); - restrict_PRECISION( vc2, vp1, l, no_threading ); + restrict_PRECISION( &vc2, vp1, l, no_threading ); if ( !l->next_level->idle ) { - vector_PRECISION_minus( vc3, vc1, vc2, 0, civs, l->next_level ); - diff = global_norm_PRECISION( vc3, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( vc1, 0, civs, l->next_level, no_threading ); + vector_PRECISION_minus( &vc3, vc1, &vc2, 0, civs, l->next_level ); + diff = global_norm_PRECISION( &vc3, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( vc1, 0, civs, l->next_level, no_threading ); test0_PRECISION("depth: %d, correctness of ( P* P - 1 ) phi_c: %le\n", l->depth, abs_PRECISION(diff) ); } END_LOCKED_MASTER(threading) if(threading->n_core>1) { interpolate3_PRECISION( vp1, vc1, l, threading ); - restrict_PRECISION( vc2, vp1, l, threading ); + restrict_PRECISION( &vc2, vp1, l, threading ); START_LOCKED_MASTER(threading) if ( !l->next_level->idle ) { - vector_PRECISION_minus( vc3, vc1, vc2, 0, civs, l->next_level ); - diff = global_norm_PRECISION( vc3, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( vc1, 0, civs, l->next_level, no_threading ); + vector_PRECISION_minus( &vc3, vc1, &vc2, 0, civs, l->next_level ); + diff = global_norm_PRECISION( &vc3, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( vc1, 0, civs, l->next_level, no_threading ); test0_PRECISION("depth: %d, correctness of ( P* P - 1 ) phi_c with threading: %le\n", l->depth, diff ); } END_LOCKED_MASTER(threading) @@ -785,27 +797,27 @@ void coarse_operator_PRECISION_test_routine( level_struct *l, struct Thread *thr START_LOCKED_MASTER(threading) if (l->depth==0) - gamma5_PRECISION( vp2, vp1, l, no_threading ); + gamma5_PRECISION( &vp2, vp1, l, no_threading ); else - coarse_gamma5_PRECISION( vp2, vp1, 0, ivs, l ); - restrict_PRECISION( vc2, vp2, l, no_threading ); - coarse_gamma5_PRECISION( vc3, vc2, 0, civs, l->next_level ); + coarse_gamma5_PRECISION( &vp2, vp1, 0, ivs, l ); + restrict_PRECISION( &vc2, &vp2, l, no_threading ); + coarse_gamma5_PRECISION( &vc3, &vc2, 0, civs, l->next_level ); if ( !l->next_level->idle ) { - vector_PRECISION_minus( vc2, vc1, vc3, 0, civs, l->next_level ); - diff = global_norm_PRECISION( vc2, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( vc1, 0, civs, l->next_level, no_threading ); + vector_PRECISION_minus( &vc2, vc1, &vc3, 0, civs, l->next_level ); + diff = global_norm_PRECISION( &vc2, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( vc1, 0, civs, l->next_level, no_threading ); test0_PRECISION("depth: %d, correctness of ( g5_c P* g5 P - 1 ) phi_c: %le\n", l->depth, diff ); } #ifdef HAVE_TM1p1 if(g.n_flavours == 2) { if (l->depth==0) - tau1_gamma5_PRECISION( vp2, vp1, l, no_threading ); + tau1_gamma5_PRECISION( &vp2, vp1, l, no_threading ); else - coarse_tau1_gamma5_PRECISION( vp2, vp1, 0, ivs, l ); - restrict_PRECISION( vc2, vp2, l, no_threading ); - coarse_tau1_gamma5_PRECISION( vc3, vc2, 0, civs, l->next_level ); + coarse_tau1_gamma5_PRECISION( &vp2, vp1, 0, ivs, l ); + restrict_PRECISION( &vc2, &vp2, l, no_threading ); + coarse_tau1_gamma5_PRECISION( &vc3, &vc2, 0, civs, l->next_level ); if ( !l->next_level->idle ) { - vector_PRECISION_minus( vc2, vc1, vc3, 0, civs, l->next_level ); - diff = global_norm_PRECISION( vc2, 0, civs, 
l->next_level, no_threading ) / global_norm_PRECISION( vc1, 0, civs, l->next_level, no_threading ); + vector_PRECISION_minus( &vc2, vc1, &vc3, 0, civs, l->next_level ); + diff = global_norm_PRECISION( &vc2, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( vc1, 0, civs, l->next_level, no_threading ); test0_PRECISION("depth: %d, correctness of ( tau1 g5_c P* tau1 g5 P - 1 ) phi_c: %le\n", l->depth, diff ); } } @@ -813,32 +825,32 @@ void coarse_operator_PRECISION_test_routine( level_struct *l, struct Thread *thr END_LOCKED_MASTER(threading) START_LOCKED_MASTER(threading) - vector_PRECISION_define( vp2, 0, 0, ivs, l ); + vector_PRECISION_define( &vp2, 0, 0, ivs, l ); if (l->depth==0) - add_diagonal_PRECISION( vp2, vp1, l->s_PRECISION.op.odd_proj, ivs ); + add_diagonal_PRECISION( &vp2, vp1, l->s_PRECISION.op.odd_proj, ivs ); else - coarse_add_block_diagonal_PRECISION( vp2, vp1, l->s_PRECISION.op.odd_proj, ivs, l ); - restrict_PRECISION( vc2, vp2, l, no_threading ); + coarse_add_block_diagonal_PRECISION( &vp2, vp1, l->s_PRECISION.op.odd_proj, ivs, l ); + restrict_PRECISION( &vc2, &vp2, l, no_threading ); - vector_PRECISION_scale( vc2, vc2, -1.0, 0, civs, l->next_level ); - coarse_add_block_diagonal_PRECISION( vc2, vc1, l->next_level->s_PRECISION.op.odd_proj, civs, l->next_level ); - diff = global_norm_PRECISION( vc2, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( vc1, 0, civs, l->next_level, no_threading ); + vector_PRECISION_scale( &vc2, &vc2, -1.0, 0, civs, l->next_level ); + coarse_add_block_diagonal_PRECISION( &vc2, vc1, l->next_level->s_PRECISION.op.odd_proj, civs, l->next_level ); + diff = global_norm_PRECISION( &vc2, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( vc1, 0, civs, l->next_level, no_threading ); test0_PRECISION("depth: %d, correctness of ( P* 1odd P - 1odd_c ) phi_c: %le\n", l->depth, diff ); END_LOCKED_MASTER(threading) #ifdef HAVE_TM START_LOCKED_MASTER(threading) if (g.mu + g.mu_odd_shift != 0.0 || g.mu + g.mu_even_shift != 0.0 ) { - vector_PRECISION_define( vp2, 0, 0, ivs, l ); + vector_PRECISION_define( &vp2, 0, 0, ivs, l ); if (l->depth==0) - add_diagonal_PRECISION( vp2, vp1, l->s_PRECISION.op.tm_term, ivs ); + add_diagonal_PRECISION( &vp2, vp1, l->s_PRECISION.op.tm_term, ivs ); else - coarse_add_anti_block_diagonal_PRECISION( vp2, vp1, l->s_PRECISION.op.tm_term, ivs, l ); - restrict_PRECISION( vc2, vp2, l, no_threading ); + coarse_add_anti_block_diagonal_PRECISION( &vp2, vp1, l->s_PRECISION.op.tm_term, ivs, l ); + restrict_PRECISION( &vc2, &vp2, l, no_threading ); - vector_PRECISION_scale( vc2, vc2, -g.mu_factor[l->next_level->depth]/g.mu_factor[l->depth], 0, civs, l->next_level ); - coarse_add_anti_block_diagonal_PRECISION( vc2, vc1, l->next_level->s_PRECISION.op.tm_term, civs, l->next_level ); - diff = global_norm_PRECISION( vc2, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( vc1, 0, civs, l->next_level, no_threading ); + vector_PRECISION_scale( &vc2, &vc2, -g.mu_factor[l->next_level->depth]/g.mu_factor[l->depth], 0, civs, l->next_level ); + coarse_add_anti_block_diagonal_PRECISION( &vc2, vc1, l->next_level->s_PRECISION.op.tm_term, civs, l->next_level ); + diff = global_norm_PRECISION( &vc2, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( vc1, 0, civs, l->next_level, no_threading ); test0_PRECISION("depth: %d, correctness of ( P* tm P - tm_c ) phi_c: %le\n", l->depth, diff ); } END_LOCKED_MASTER(threading) @@ -848,16 +860,16 @@ void coarse_operator_PRECISION_test_routine( level_struct 
*l, struct Thread *thr START_LOCKED_MASTER(threading) if ( g.n_flavours == 2 && ( g.epsbar != 0 || g.epsbar_ig5_odd_shift != 0 || g.epsbar_ig5_odd_shift != 0 ) ) { - vector_PRECISION_define( vp2, 0, 0, ivs, l ); + vector_PRECISION_define( &vp2, 0, 0, ivs, l ); if (l->depth==0) - apply_doublet_coupling_PRECISION( vp2, vp1, l->s_PRECISION.op.epsbar_term, ivs ); + apply_doublet_coupling_PRECISION( &vp2, vp1, l->s_PRECISION.op.epsbar_term, ivs ); else - coarse_add_doublet_coupling_PRECISION( vp2, vp1, l->s_PRECISION.op.epsbar_term, ivs, l ); - restrict_PRECISION( vc2, vp2, l, no_threading ); + coarse_add_doublet_coupling_PRECISION( &vp2, vp1, l->s_PRECISION.op.epsbar_term, ivs, l ); + restrict_PRECISION( &vc2, &vp2, l, no_threading ); - vector_PRECISION_scale( vc2, vc2, -g.epsbar_factor[l->next_level->depth]/g.epsbar_factor[l->depth], 0, civs, l->next_level ); - coarse_add_doublet_coupling_PRECISION( vc2, vc1, l->next_level->s_PRECISION.op.epsbar_term, civs, l->next_level ); - diff = global_norm_PRECISION( vc2, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( vc1, 0, civs, l->next_level, no_threading ); + vector_PRECISION_scale( &vc2, &vc2, -g.epsbar_factor[l->next_level->depth]/g.epsbar_factor[l->depth], 0, civs, l->next_level ); + coarse_add_doublet_coupling_PRECISION( &vc2, vc1, l->next_level->s_PRECISION.op.epsbar_term, civs, l->next_level ); + diff = global_norm_PRECISION( &vc2, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( vc1, 0, civs, l->next_level, no_threading ); test0_PRECISION("depth: %d, correctness of ( P* eps P - eps_c ) phi_c: %le\n", l->depth, diff ); } END_LOCKED_MASTER(threading) @@ -867,28 +879,28 @@ void coarse_operator_PRECISION_test_routine( level_struct *l, struct Thread *thr START_LOCKED_MASTER(threading) interpolate3_PRECISION( vp1, vc1, l, no_threading ); - apply_operator_PRECISION( vp2, vp1, &(l->p_PRECISION), l, no_threading ); + apply_operator_PRECISION( &vp2, vp1, &(l->p_PRECISION), l, no_threading ); #ifdef HAVE_TM if (g.mu + g.mu_odd_shift != 0.0 || g.mu + g.mu_even_shift != 0.0 ) if (g.mu_factor[l->depth] != g.mu_factor[l->next_level->depth]) { - vector_PRECISION_scale( vp3, vp1, (g.mu_factor[l->next_level->depth]/g.mu_factor[l->depth])-1., 0, ivs, l ); + vector_PRECISION_scale( &vp3, vp1, (g.mu_factor[l->next_level->depth]/g.mu_factor[l->depth])-1., 0, ivs, l ); if(l->depth == 0) - add_diagonal_PRECISION( vp2, vp3, l->p_PRECISION.op->tm_term, ivs ); + add_diagonal_PRECISION( &vp2, &vp3, l->p_PRECISION.op->tm_term, ivs ); else - coarse_add_anti_block_diagonal_PRECISION( vp2, vp3, l->p_PRECISION.op->tm_term, ivs, l ); + coarse_add_anti_block_diagonal_PRECISION( &vp2, &vp3, l->p_PRECISION.op->tm_term, ivs, l ); } #endif - restrict_PRECISION( vc2, vp2, l, no_threading ); + restrict_PRECISION( &vc2, &vp2, l, no_threading ); if ( !l->next_level->idle ) { if ( l->level==1 && g.odd_even ) - coarse_odd_even_PRECISION_test( vc3, vc1, l->next_level, no_threading ); + coarse_odd_even_PRECISION_test( &vc3, vc1, l->next_level, no_threading ); else - apply_operator_PRECISION( vc3, vc1, &(l->next_level->p_PRECISION), l->next_level, no_threading ); + apply_operator_PRECISION( &vc3, vc1, &(l->next_level->p_PRECISION), l->next_level, no_threading ); - vector_PRECISION_minus( vc3, vc2, vc3, 0, civs, l->next_level ); - diff = global_norm_PRECISION( vc3, 0, civs, l->next_level, no_threading ) /global_norm_PRECISION( vc2, 0, civs, l->next_level, no_threading ); + vector_PRECISION_minus( &vc3, &vc2, &vc3, 0, civs, l->next_level ); + diff = 
global_norm_PRECISION( &vc3, 0, civs, l->next_level, no_threading ) /global_norm_PRECISION( &vc2, 0, civs, l->next_level, no_threading ); if ( l->level==1 && g.odd_even ) { test0_PRECISION("depth: %d, correctness of odd even preconditioned ( P* D P - D_c ) phi_c: %le\n", l->depth, diff ); @@ -901,14 +913,14 @@ void coarse_operator_PRECISION_test_routine( level_struct *l, struct Thread *thr if(threading->n_core>1) { if ( !l->next_level->idle ) { if ( l->level==1 && g.odd_even ) - coarse_odd_even_PRECISION_test( vc3, vc1, l->next_level, threading ); + coarse_odd_even_PRECISION_test( &vc3, vc1, l->next_level, threading ); else - apply_operator_PRECISION( vc3, vc1, &(l->next_level->p_PRECISION), l->next_level, threading ); + apply_operator_PRECISION( &vc3, vc1, &(l->next_level->p_PRECISION), l->next_level, threading ); } START_LOCKED_MASTER(threading) if ( !l->next_level->idle ) { - vector_PRECISION_minus( vc3, vc2, vc3, 0, civs, l->next_level ); - diff = global_norm_PRECISION( vc3, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( vc2, 0, civs, l->next_level, no_threading ); + vector_PRECISION_minus( &vc3, &vc2, &vc3, 0, civs, l->next_level ); + diff = global_norm_PRECISION( &vc3, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( &vc2, 0, civs, l->next_level, no_threading ); if ( l->level==1 && g.odd_even ) { //TODO: this test doesn't work without SSE!! test0_PRECISION("depth: %d, correctness of odd even preconditioned ( P* D P - D_c ) phi_c with D_c threaded: %le\n", l->depth, diff ); } else { @@ -922,27 +934,27 @@ void coarse_operator_PRECISION_test_routine( level_struct *l, struct Thread *thr if ( l->level > 0 && l->depth > 0 && g.method == 3 && g.odd_even ) { vector_PRECISION_define_random( vp1, 0, ivs, l ); - block_to_oddeven_PRECISION( vp4, vp1, l, no_threading ); - coarse_diag_ee_PRECISION( vp3, vp4, &(l->oe_op_PRECISION), l, no_threading ); - coarse_diag_oo_PRECISION( vp3, vp4, &(l->oe_op_PRECISION), l, no_threading ); - coarse_hopping_term_PRECISION( vp3, vp4, &(l->oe_op_PRECISION), _FULL_SYSTEM, l, no_threading ); - oddeven_to_block_PRECISION( vp4, vp3, l, no_threading ); - apply_operator_PRECISION( vp2, vp1, &(l->p_PRECISION), l, no_threading ); - vector_PRECISION_minus( vp4, vp4, vp2, 0, ivs, l ); - diff = global_norm_PRECISION( vp4, 0, ivs, l, no_threading ) / global_norm_PRECISION( vp2, 0, ivs, l, no_threading ); + block_to_oddeven_PRECISION( &vp4, vp1, l, no_threading ); + coarse_diag_ee_PRECISION( &vp3, &vp4, &(l->oe_op_PRECISION), l, no_threading ); + coarse_diag_oo_PRECISION( &vp3, &vp4, &(l->oe_op_PRECISION), l, no_threading ); + coarse_hopping_term_PRECISION( &vp3, &vp4, &(l->oe_op_PRECISION), _FULL_SYSTEM, l, no_threading ); + oddeven_to_block_PRECISION( &vp4, &vp3, l, no_threading ); + apply_operator_PRECISION( &vp2, vp1, &(l->p_PRECISION), l, no_threading ); + vector_PRECISION_minus( &vp4, &vp4, &vp2, 0, ivs, l ); + diff = global_norm_PRECISION( &vp4, 0, ivs, l, no_threading ) / global_norm_PRECISION( &vp2, 0, ivs, l, no_threading ); test0_PRECISION("depth: %d, correctness of odd even layout (smoother): %le\n", l->depth, diff ); - block_to_oddeven_PRECISION( vp4, vp1, l, no_threading ); - coarse_odd_even_PRECISION_test( vp3, vp4, l, no_threading ); - oddeven_to_block_PRECISION( vp4, vp3, l, no_threading ); - apply_operator_PRECISION( vp2, vp1, &(l->p_PRECISION), l, no_threading ); - vector_PRECISION_minus( vp4, vp4, vp2, 0, ivs, l ); - diff = global_norm_PRECISION( vp4, 0, ivs, l, no_threading ) / global_norm_PRECISION( vp2, 0, ivs, l, 
no_threading ); + block_to_oddeven_PRECISION( &vp4, vp1, l, no_threading ); + coarse_odd_even_PRECISION_test( &vp3, &vp4, l, no_threading ); + oddeven_to_block_PRECISION( &vp4, &vp3, l, no_threading ); + apply_operator_PRECISION( &vp2, vp1, &(l->p_PRECISION), l, no_threading ); + vector_PRECISION_minus( &vp4, &vp4, &vp2, 0, ivs, l ); + diff = global_norm_PRECISION( &vp4, 0, ivs, l, no_threading ) / global_norm_PRECISION( &vp2, 0, ivs, l, no_threading ); test0_PRECISION("depth: %d, correctness of odd even preconditioned operator (smoother): %le\n", l->depth, diff ); } - FREE( vp1, complex_PRECISION, 4*vs ); - FREE( vc1, complex_PRECISION, 3*cvs ); + FREE( vp1->vector_buffer, complex_PRECISION, 4*vs ); + FREE( vc1->vector_buffer, complex_PRECISION, 3*cvs ); END_LOCKED_MASTER(threading) if ( g.method != 6 && l->next_level->level > 0 && !l->next_level->idle ) { diff --git a/src/coarse_operator_generic.h b/src/coarse_operator_generic.h index 3af0655..59e3b62 100644 --- a/src/coarse_operator_generic.h +++ b/src/coarse_operator_generic.h @@ -35,40 +35,40 @@ void coarse_operator_PRECISION_set_self_couplings( operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); void coarse_operator_PRECISION_set_neighbor_couplings( operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); - void set_coarse_self_coupling_PRECISION( vector_PRECISION buffer1, vector_PRECISION buffer2, + void set_coarse_self_coupling_PRECISION( vector_PRECISION *buffer1, vector_PRECISION *buffer2, vector_PRECISION *V, const int n, level_struct *l ); - void set_coarse_neighbor_coupling_PRECISION( vector_PRECISION buffer1, vector_PRECISION buffer2, + void set_coarse_neighbor_coupling_PRECISION( vector_PRECISION *buffer1, vector_PRECISION *buffer2, vector_PRECISION *V, const int mu, const int n, level_struct *l ); - void coarse_self_couplings_PRECISION( vector_PRECISION eta, vector_PRECISION phi, + void coarse_self_couplings_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, operator_PRECISION_struct *op, int start, int end, level_struct *l ); - void coarse_spinwise_self_couplings_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, vector_PRECISION phi, + void coarse_spinwise_self_couplings_PRECISION( vector_PRECISION *eta1, vector_PRECISION *eta2, vector_PRECISION *phi, config_PRECISION clover, int length, level_struct *l ); - void coarse_gamma5_PRECISION( vector_PRECISION eta, vector_PRECISION phi, int start, int end, level_struct *l ); - void coarse_tau1_gamma5_PRECISION( vector_PRECISION eta, vector_PRECISION phi, int start, int end, level_struct *l ); - void apply_coarse_operator_PRECISION( vector_PRECISION eta, vector_PRECISION phi, + void coarse_gamma5_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, int start, int end, level_struct *l ); + void coarse_tau1_gamma5_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, int start, int end, level_struct *l ); + void apply_coarse_operator_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); - void g5D_apply_coarse_operator_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operator_PRECISION_struct *op, + void g5D_apply_coarse_operator_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); - void apply_coarse_operator_dagger_PRECISION( vector_PRECISION eta, vector_PRECISION phi, + void apply_coarse_operator_dagger_PRECISION( vector_PRECISION *eta, vector_PRECISION 
*phi, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); - void coarse_block_operator_PRECISION( vector_PRECISION eta, vector_PRECISION phi, int start, + void coarse_block_operator_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, int start, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ); - void coarse_aggregate_self_couplings_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, vector_PRECISION phi, + void coarse_aggregate_self_couplings_PRECISION( vector_PRECISION *eta1, vector_PRECISION *eta2, vector_PRECISION *phi, schwarz_PRECISION_struct *s, level_struct *l ); - void coarse_aggregate_neighbor_couplings_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, vector_PRECISION phi, const int mu, schwarz_PRECISION_struct *s, level_struct *l ); + void coarse_aggregate_neighbor_couplings_PRECISION( vector_PRECISION *eta1, vector_PRECISION *eta2, vector_PRECISION *phi, const int mu, schwarz_PRECISION_struct *s, level_struct *l ); - void set_block_diagonal_PRECISION( vector_PRECISION spin_0_1, vector_PRECISION spin_2_3, vector_PRECISION *V, const int n, config_PRECISION block, level_struct *l ); + void set_block_diagonal_PRECISION( vector_PRECISION *spin_0_1, vector_PRECISION *spin_2_3, vector_PRECISION *V, const int n, config_PRECISION block, level_struct *l ); - void coarse_aggregate_block_diagonal_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, vector_PRECISION phi, config_PRECISION block, level_struct *l ); + void coarse_aggregate_block_diagonal_PRECISION( vector_PRECISION *eta1, vector_PRECISION *eta2, vector_PRECISION *phi, config_PRECISION block, level_struct *l ); void coarse_operator_PRECISION_test_routine( level_struct *l, struct Thread *threading ); // eta += D*phi, D stored columnwise - static inline void mv_PRECISION( const vector_PRECISION eta, const complex_PRECISION *D, - const vector_PRECISION phi, const register int n ) { + static inline void mv_PRECISION( const buffer_PRECISION eta, const complex_PRECISION *D, + const buffer_PRECISION phi, const register int n ) { register int i, j, k=0; for ( i=0; inum_lattice_site_var, @@ -199,7 +199,7 @@ clover_step_size1 = (num_eig_vect * (num_eig_vect+1))/2, clover_step_size2 = SQUARE(num_eig_vect); config_PRECISION clover_pt = clover; - vector_PRECISION phi_pt=phi, eta_pt=eta, phi_end_pt=phi+length; + buffer_PRECISION phi_pt=phi->vector_buffer, eta_pt=eta->vector_buffer, phi_end_pt=phi->vector_buffer+length; // U(x) = [ A B , A=A*, D=D*, C = -B* // C D ] // storage order: upper triangle of A, upper triangle of D, B, columnwise @@ -257,13 +257,13 @@ } } - static inline void coarse_add_block_diagonal_PRECISION( vector_PRECISION eta, vector_PRECISION phi, + static inline void coarse_add_block_diagonal_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, config_PRECISION block, int length, level_struct *l ) { int num_eig_vect = l->num_parent_eig_vect, block_step_size = (num_eig_vect * (num_eig_vect+1))/2; config_PRECISION block_pt = block; - vector_PRECISION phi_pt=phi, eta_pt=eta, phi_end_pt=phi+length; + buffer_PRECISION phi_pt=phi->vector_buffer, eta_pt=eta->vector_buffer, phi_end_pt=phi->vector_buffer+length; // U(x) = [ A 0 , A=A*, D=D* diag. 
excluded // 0 D ] // storage order: upper triangle of A, upper triangle of D, columnwise @@ -294,13 +294,13 @@ } } - static inline void coarse_add_anti_block_diagonal_PRECISION( vector_PRECISION eta, vector_PRECISION phi, + static inline void coarse_add_anti_block_diagonal_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, config_PRECISION block, int length, level_struct *l ) { int num_eig_vect = l->num_parent_eig_vect, block_step_size = (num_eig_vect * (num_eig_vect+1))/2; config_PRECISION block_pt = block; - vector_PRECISION phi_pt=phi, eta_pt=eta, phi_end_pt=phi+length; + buffer_PRECISION phi_pt=phi->vector_buffer, eta_pt=eta->vector_buffer, phi_end_pt=phi->vector_buffer+length; // U(x) = [ A 0 , A=-A*, D=-D* diag. excluded // 0 D ] // storage order: upper triangle of A, upper triangle of D, columnwise @@ -331,14 +331,14 @@ } } - static inline void coarse_add_doublet_coupling_PRECISION( vector_PRECISION eta, vector_PRECISION phi, + static inline void coarse_add_doublet_coupling_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, config_PRECISION block, int length, level_struct *l ) { #ifdef HAVE_TM1p1 int num_eig_vect = l->num_parent_eig_vect, block_step_size = (num_eig_vect * (num_eig_vect+1))/2; config_PRECISION block_pt = block; - vector_PRECISION phi_pt=phi, eta_pt=eta, phi_end_pt=phi+length; + buffer_PRECISION phi_pt=phi->vector_buffer, eta_pt=eta->vector_buffer, phi_end_pt=phi->vector_buffer+length; // U(x) = [ 0 A , A=-A*, D=-D* diag. excluded // D 0 ] // storage order: upper triangle of A, upper triangle of D, columnwise @@ -360,7 +360,7 @@ #endif } - static inline void coarse_hopp_PRECISION( vector_PRECISION eta, vector_PRECISION phi, + static inline void coarse_hopp_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, config_PRECISION D, level_struct *l ) { int num_eig_vect = l->num_parent_eig_vect, @@ -373,58 +373,58 @@ #ifdef HAVE_TM1p1 if( g.n_flavours == 2 ) { // A - nmv_PRECISION( eta, D, phi, num_eig_vect ); - eta += num_eig_vect;//1 - phi += num_eig_vect;//1 - nmv_PRECISION( eta, D, phi, num_eig_vect ); + nmv_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); + eta->vector_buffer += num_eig_vect;//1 + phi->vector_buffer += num_eig_vect;//1 + nmv_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); // C - eta += num_eig_vect;//2 - phi -= num_eig_vect;//0 + eta->vector_buffer += num_eig_vect;//2 + phi->vector_buffer -= num_eig_vect;//0 D += num_eig_vect2; - nmv_PRECISION( eta, D, phi, num_eig_vect ); - eta += num_eig_vect;//3 - phi += num_eig_vect;//1 - nmv_PRECISION( eta, D, phi, num_eig_vect ); + nmv_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); + eta->vector_buffer += num_eig_vect;//3 + phi->vector_buffer += num_eig_vect;//1 + nmv_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); // B - eta -= 3*num_eig_vect;//0 - phi += num_eig_vect;//2 + eta->vector_buffer -= 3*num_eig_vect;//0 + phi->vector_buffer += num_eig_vect;//2 D += num_eig_vect2; - nmv_PRECISION( eta, D, phi, num_eig_vect ); - eta += num_eig_vect;//1 - phi += num_eig_vect;//3 - nmv_PRECISION( eta, D, phi, num_eig_vect ); + nmv_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); + eta->vector_buffer += num_eig_vect;//1 + phi->vector_buffer += num_eig_vect;//3 + nmv_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); // D - eta += num_eig_vect;//2 - phi -= num_eig_vect;//2 + eta->vector_buffer += num_eig_vect;//2 + phi->vector_buffer -= num_eig_vect;//2 D += num_eig_vect2; - 
nmv_PRECISION( eta, D, phi, num_eig_vect ); - eta += num_eig_vect;//3 - phi += num_eig_vect;//3 - nmv_PRECISION( eta, D, phi, num_eig_vect ); + nmv_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); + eta->vector_buffer += num_eig_vect;//3 + phi->vector_buffer += num_eig_vect;//3 + nmv_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); } else { #endif // A - nmv_PRECISION( eta, D, phi, num_eig_vect ); + nmv_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); // C - eta += num_eig_vect; + eta->vector_buffer += num_eig_vect; D += num_eig_vect2; - nmv_PRECISION( eta, D, phi, num_eig_vect ); + nmv_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); // B - phi += num_eig_vect; - eta -= num_eig_vect; + phi->vector_buffer += num_eig_vect; + eta->vector_buffer -= num_eig_vect; D += num_eig_vect2; - nmv_PRECISION( eta, D, phi, num_eig_vect ); + nmv_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); // D - eta += num_eig_vect; + eta->vector_buffer += num_eig_vect; D += num_eig_vect2; - nmv_PRECISION( eta, D, phi, num_eig_vect ); + nmv_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); #ifdef HAVE_TM1p1 } #endif } - static inline void coarse_daggered_hopp_PRECISION( vector_PRECISION eta, vector_PRECISION phi, + static inline void coarse_daggered_hopp_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, config_PRECISION D, level_struct *l ) { int num_eig_vect = l->num_parent_eig_vect, @@ -437,57 +437,57 @@ #ifdef HAVE_TM1p1 if( g.n_flavours == 2 ) { // A* - nmvh_PRECISION( eta, D, phi, num_eig_vect ); - eta += num_eig_vect;//1 - phi += num_eig_vect;//1 - nmvh_PRECISION( eta, D, phi, num_eig_vect ); + nmvh_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); + eta->vector_buffer += num_eig_vect;//1 + phi->vector_buffer += num_eig_vect;//1 + nmvh_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); // -C* - eta -= num_eig_vect;//0 - phi += num_eig_vect;//2 + eta->vector_buffer -= num_eig_vect;//0 + phi->vector_buffer += num_eig_vect;//2 D += num_eig_vect2; - mvh_PRECISION( eta, D, phi, num_eig_vect ); - eta += num_eig_vect;//1 - phi += num_eig_vect;//3 - mvh_PRECISION( eta, D, phi, num_eig_vect ); + mvh_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); + eta->vector_buffer += num_eig_vect;//1 + phi->vector_buffer += num_eig_vect;//3 + mvh_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); // -B* - eta += num_eig_vect;//2 - phi -= 3*num_eig_vect;//0 + eta->vector_buffer += num_eig_vect;//2 + phi->vector_buffer -= 3*num_eig_vect;//0 D += num_eig_vect2; - mvh_PRECISION( eta, D, phi, num_eig_vect ); - eta += num_eig_vect;//3 - phi += num_eig_vect;//1 - mvh_PRECISION( eta, D, phi, num_eig_vect ); + mvh_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); + eta->vector_buffer += num_eig_vect;//3 + phi->vector_buffer += num_eig_vect;//1 + mvh_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); // D* - eta -= num_eig_vect;//2 - phi += num_eig_vect;//2 + eta->vector_buffer -= num_eig_vect;//2 + phi->vector_buffer += num_eig_vect;//2 D += num_eig_vect2; - nmvh_PRECISION( eta, D, phi, num_eig_vect ); - eta += num_eig_vect;//3 - phi += num_eig_vect;//3 - nmvh_PRECISION( eta, D, phi, num_eig_vect ); + nmvh_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); + eta->vector_buffer += num_eig_vect;//3 + phi->vector_buffer += num_eig_vect;//3 + nmvh_PRECISION( eta->vector_buffer, D, 
phi->vector_buffer, num_eig_vect ); } else { #endif // A* - nmvh_PRECISION( eta, D, phi, num_eig_vect ); + nmvh_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); // -C* - phi += num_eig_vect; + phi->vector_buffer += num_eig_vect; D += num_eig_vect2; - mvh_PRECISION( eta, D, phi, num_eig_vect ); + mvh_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); // -B* - eta += num_eig_vect; - phi -= num_eig_vect; + eta->vector_buffer += num_eig_vect; + phi->vector_buffer -= num_eig_vect; D += num_eig_vect2; - mvh_PRECISION( eta, D, phi, num_eig_vect ); + mvh_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); // D* - phi += num_eig_vect; + phi->vector_buffer += num_eig_vect; D += num_eig_vect2; - nmvh_PRECISION( eta, D, phi, num_eig_vect ); + nmvh_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); #ifdef HAVE_TM1p1 } #endif } - static inline void coarse_n_hopp_PRECISION( vector_PRECISION eta, vector_PRECISION phi, + static inline void coarse_n_hopp_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, config_PRECISION D, level_struct *l ) { int num_eig_vect = l->num_parent_eig_vect, @@ -500,57 +500,57 @@ #ifdef HAVE_TM1p1 if( g.n_flavours == 2 ) { // A - mv_PRECISION( eta, D, phi, num_eig_vect ); - eta += num_eig_vect;//1 - phi += num_eig_vect;//1 - mv_PRECISION( eta, D, phi, num_eig_vect ); + mv_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); + eta->vector_buffer += num_eig_vect;//1 + phi->vector_buffer += num_eig_vect;//1 + mv_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); // C - eta += num_eig_vect;//2 - phi -= num_eig_vect;//0 + eta->vector_buffer += num_eig_vect;//2 + phi->vector_buffer -= num_eig_vect;//0 D += num_eig_vect2; - mv_PRECISION( eta, D, phi, num_eig_vect ); - eta += num_eig_vect;//3 - phi += num_eig_vect;//1 - mv_PRECISION( eta, D, phi, num_eig_vect ); + mv_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); + eta->vector_buffer += num_eig_vect;//3 + phi->vector_buffer += num_eig_vect;//1 + mv_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); // B - eta -= 3*num_eig_vect;//0 - phi += num_eig_vect;//2 + eta->vector_buffer -= 3*num_eig_vect;//0 + phi->vector_buffer += num_eig_vect;//2 D += num_eig_vect2; - mv_PRECISION( eta, D, phi, num_eig_vect ); - eta += num_eig_vect;//1 - phi += num_eig_vect;//3 - mv_PRECISION( eta, D, phi, num_eig_vect ); + mv_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); + eta->vector_buffer += num_eig_vect;//1 + phi->vector_buffer += num_eig_vect;//3 + mv_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); // D - eta += num_eig_vect;//2 - phi -= num_eig_vect;//2 + eta->vector_buffer += num_eig_vect;//2 + phi->vector_buffer -= num_eig_vect;//2 D += num_eig_vect2; - mv_PRECISION( eta, D, phi, num_eig_vect ); - eta += num_eig_vect;//3 - phi += num_eig_vect;//3 - mv_PRECISION( eta, D, phi, num_eig_vect ); + mv_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); + eta->vector_buffer += num_eig_vect;//3 + phi->vector_buffer += num_eig_vect;//3 + mv_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); } else { #endif // A - mv_PRECISION( eta, D, phi, num_eig_vect ); + mv_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); // C - eta += num_eig_vect; + eta->vector_buffer += num_eig_vect; D += num_eig_vect2; - mv_PRECISION( eta, D, phi, num_eig_vect ); + mv_PRECISION( eta->vector_buffer, D, phi->vector_buffer, 
num_eig_vect ); // B - phi += num_eig_vect; - eta -= num_eig_vect; + phi->vector_buffer += num_eig_vect; + eta->vector_buffer -= num_eig_vect; D += num_eig_vect2; - mv_PRECISION( eta, D, phi, num_eig_vect ); + mv_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); // D - eta += num_eig_vect; + eta->vector_buffer += num_eig_vect; D += num_eig_vect2; - mv_PRECISION( eta, D, phi, num_eig_vect ); + mv_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); #ifdef HAVE_TM1p1 } #endif } - static inline void coarse_n_daggered_hopp_PRECISION( vector_PRECISION eta, vector_PRECISION phi, + static inline void coarse_n_daggered_hopp_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, config_PRECISION D, level_struct *l ) { int num_eig_vect = l->num_parent_eig_vect, @@ -563,58 +563,58 @@ #ifdef HAVE_TM1p1 if( g.n_flavours == 2 ) { // A* - mvh_PRECISION( eta, D, phi, num_eig_vect ); - eta += num_eig_vect;//1 - phi += num_eig_vect;//1 - mvh_PRECISION( eta, D, phi, num_eig_vect ); + mvh_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); + eta->vector_buffer += num_eig_vect;//1 + phi->vector_buffer += num_eig_vect;//1 + mvh_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); // -C* - eta -= num_eig_vect;//0 - phi += num_eig_vect;//2 + eta->vector_buffer -= num_eig_vect;//0 + phi->vector_buffer += num_eig_vect;//2 D += num_eig_vect2; - nmvh_PRECISION( eta, D, phi, num_eig_vect ); - eta += num_eig_vect;//1 - phi += num_eig_vect;//3 - nmvh_PRECISION( eta, D, phi, num_eig_vect ); + nmvh_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); + eta->vector_buffer += num_eig_vect;//1 + phi->vector_buffer += num_eig_vect;//3 + nmvh_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); // -B* - eta += num_eig_vect;//2 - phi -= 3*num_eig_vect;//0 + eta->vector_buffer += num_eig_vect;//2 + phi->vector_buffer -= 3*num_eig_vect;//0 D += num_eig_vect2; - nmvh_PRECISION( eta, D, phi, num_eig_vect ); - eta += num_eig_vect;//3 - phi += num_eig_vect;//1 - nmvh_PRECISION( eta, D, phi, num_eig_vect ); + nmvh_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); + eta->vector_buffer += num_eig_vect;//3 + phi->vector_buffer += num_eig_vect;//1 + nmvh_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); // D* - eta -= num_eig_vect;//2 - phi += num_eig_vect;//2 + eta->vector_buffer -= num_eig_vect;//2 + phi->vector_buffer += num_eig_vect;//2 D += num_eig_vect2; - mvh_PRECISION( eta, D, phi, num_eig_vect ); - eta += num_eig_vect;//3 - phi += num_eig_vect;//3 - mvh_PRECISION( eta, D, phi, num_eig_vect ); + mvh_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); + eta->vector_buffer += num_eig_vect;//3 + phi->vector_buffer += num_eig_vect;//3 + mvh_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); } else { #endif // A* - mvh_PRECISION( eta, D, phi, num_eig_vect ); + mvh_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); // -C* - phi += num_eig_vect; + phi->vector_buffer += num_eig_vect; D += num_eig_vect2; - nmvh_PRECISION( eta, D, phi, num_eig_vect ); + nmvh_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); // -B* - eta += num_eig_vect; - phi -= num_eig_vect; + eta->vector_buffer += num_eig_vect; + phi->vector_buffer -= num_eig_vect; D += num_eig_vect2; - nmvh_PRECISION( eta, D, phi, num_eig_vect ); + nmvh_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); // D* - phi += num_eig_vect; + 
phi->vector_buffer += num_eig_vect; D += num_eig_vect2; - mvh_PRECISION( eta, D, phi, num_eig_vect ); + mvh_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); #ifdef HAVE_TM1p1 } #endif } - static inline void coarse_spinwise_hopp_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, - vector_PRECISION phi, config_PRECISION D, level_struct *l ) { + static inline void coarse_spinwise_hopp_PRECISION( vector_PRECISION *eta1, vector_PRECISION *eta2, + vector_PRECISION *phi, config_PRECISION D, level_struct *l ) { int num_eig_vect = l->num_parent_eig_vect, num_eig_vect2 = SQUARE(l->num_parent_eig_vect); @@ -624,24 +624,24 @@ // note: minus sign of D = self_coupling - hopping_term is added here // A - mv_PRECISION( eta1, D, phi, num_eig_vect ); + mv_PRECISION( eta1->vector_buffer, D, phi->vector_buffer, num_eig_vect ); // C - eta1 += num_eig_vect; + eta1->vector_buffer += num_eig_vect; D += num_eig_vect2; - mv_PRECISION( eta1, D, phi, num_eig_vect ); + mv_PRECISION( eta1->vector_buffer, D, phi->vector_buffer, num_eig_vect ); // B - phi += num_eig_vect; + phi->vector_buffer += num_eig_vect; D += num_eig_vect2; - mv_PRECISION( eta2, D, phi, num_eig_vect ); + mv_PRECISION( eta2->vector_buffer, D, phi->vector_buffer, num_eig_vect ); // D - eta2 += num_eig_vect; + eta2->vector_buffer += num_eig_vect; D += num_eig_vect2; - mv_PRECISION( eta2, D, phi, num_eig_vect ); + mv_PRECISION( eta2->vector_buffer, D, phi->vector_buffer, num_eig_vect ); } - static inline void coarse_spinwise_daggered_hopp_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, - vector_PRECISION phi, config_PRECISION D, level_struct *l ) { + static inline void coarse_spinwise_daggered_hopp_PRECISION( vector_PRECISION *eta1, vector_PRECISION *eta2, + vector_PRECISION *phi, config_PRECISION D, level_struct *l ) { int num_eig_vect = l->num_parent_eig_vect, num_eig_vect2 = SQUARE(l->num_parent_eig_vect); @@ -651,25 +651,25 @@ // note: minus sign of D = self_coupling - hopping_term is added here // A* - mvh_PRECISION( eta1, D, phi, num_eig_vect ); + mvh_PRECISION( eta1->vector_buffer, D, phi->vector_buffer, num_eig_vect ); // -C* - phi += num_eig_vect; + phi->vector_buffer += num_eig_vect; D += num_eig_vect2; - nmvh_PRECISION( eta2, D, phi, num_eig_vect ); + nmvh_PRECISION( eta2->vector_buffer, D, phi->vector_buffer, num_eig_vect ); // -B* - eta1 += num_eig_vect; - phi -= num_eig_vect; + eta1->vector_buffer += num_eig_vect; + phi->vector_buffer -= num_eig_vect; D += num_eig_vect2; - nmvh_PRECISION( eta1, D, phi, num_eig_vect ); + nmvh_PRECISION( eta1->vector_buffer, D, phi->vector_buffer, num_eig_vect ); // D* - eta2 += num_eig_vect; - phi += num_eig_vect; + eta2->vector_buffer += num_eig_vect; + phi->vector_buffer += num_eig_vect; D += num_eig_vect2; - mvh_PRECISION( eta2, D, phi, num_eig_vect ); + mvh_PRECISION( eta2->vector_buffer, D, phi->vector_buffer, num_eig_vect ); } - static inline void coarse_spinwise_n_hopp_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, - vector_PRECISION phi, config_PRECISION D, level_struct *l ) { + static inline void coarse_spinwise_n_hopp_PRECISION( vector_PRECISION *eta1, vector_PRECISION *eta2, + vector_PRECISION *phi, config_PRECISION D, level_struct *l ) { int num_eig_vect = l->num_parent_eig_vect, num_eig_vect2 = SQUARE(l->num_parent_eig_vect); @@ -679,24 +679,24 @@ // note: minus sign of D = self_coupling - hopping_term is added here // A - nmv_PRECISION( eta1, D, phi, num_eig_vect ); + nmv_PRECISION( eta1->vector_buffer, D, phi->vector_buffer, num_eig_vect ); // C - 
eta1 += num_eig_vect; + eta1->vector_buffer += num_eig_vect; D += num_eig_vect2; - nmv_PRECISION( eta1, D, phi, num_eig_vect ); + nmv_PRECISION( eta1->vector_buffer, D, phi->vector_buffer, num_eig_vect ); // B - phi += num_eig_vect; + phi->vector_buffer += num_eig_vect; D += num_eig_vect2; - nmv_PRECISION( eta2, D, phi, num_eig_vect ); + nmv_PRECISION( eta2->vector_buffer, D, phi->vector_buffer, num_eig_vect ); // D - eta2 += num_eig_vect; + eta2->vector_buffer += num_eig_vect; D += num_eig_vect2; - nmv_PRECISION( eta2, D, phi, num_eig_vect ); + nmv_PRECISION( eta2->vector_buffer, D, phi->vector_buffer, num_eig_vect ); } - static inline void coarse_spinwise_n_daggered_hopp_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, - vector_PRECISION phi, config_PRECISION D, level_struct *l ) { + static inline void coarse_spinwise_n_daggered_hopp_PRECISION( vector_PRECISION *eta1, vector_PRECISION *eta2, + vector_PRECISION *phi, config_PRECISION D, level_struct *l ) { int num_eig_vect = l->num_parent_eig_vect, num_eig_vect2 = SQUARE(l->num_parent_eig_vect); @@ -706,21 +706,21 @@ // note: minus sign of D = self_coupling - hopping_term is added here // A* - nmvh_PRECISION( eta1, D, phi, num_eig_vect ); + nmvh_PRECISION( eta1->vector_buffer, D, phi->vector_buffer, num_eig_vect ); // -C* - phi += num_eig_vect; + phi->vector_buffer += num_eig_vect; D += num_eig_vect2; - mvh_PRECISION( eta2, D, phi, num_eig_vect ); + mvh_PRECISION( eta2->vector_buffer, D, phi->vector_buffer, num_eig_vect ); // -B* - eta1 += num_eig_vect; - phi -= num_eig_vect; + eta1->vector_buffer += num_eig_vect; + phi->vector_buffer -= num_eig_vect; D += num_eig_vect2; - mvh_PRECISION( eta1, D, phi, num_eig_vect ); + mvh_PRECISION( eta1->vector_buffer, D, phi->vector_buffer, num_eig_vect ); // D* - eta2 += num_eig_vect; - phi += num_eig_vect; + eta2->vector_buffer += num_eig_vect; + phi->vector_buffer += num_eig_vect; D += num_eig_vect2; - nmvh_PRECISION( eta2, D, phi, num_eig_vect ); + nmvh_PRECISION( eta2->vector_buffer, D, phi->vector_buffer, num_eig_vect ); } #endif diff --git a/src/coarsening_generic.c b/src/coarsening_generic.c index ae7893b..a3c6313 100644 --- a/src/coarsening_generic.c +++ b/src/coarsening_generic.c @@ -30,7 +30,7 @@ void interpolation_PRECISION_struct_init( interpolation_PRECISION_struct *is ) { is->test_vector = NULL; is->interpolation = NULL; is->eigenvalues = NULL; - is->tmp = NULL; + vector_PRECISION_init(&(is->tmp)); is->bootstrap_vector = NULL; is->bootstrap_eigenvalues = NULL; } diff --git a/src/data_generic.c b/src/data_generic.c index 950c814..ba63e3c 100644 --- a/src/data_generic.c +++ b/src/data_generic.c @@ -22,7 +22,7 @@ #include "main.h" // vector storage for PRECISION precision -void vector_PRECISION_define( vector_PRECISION phi, complex_PRECISION value, int start, int end, level_struct *l ) { +void buffer_PRECISION_define( complex_PRECISION *phi, complex_PRECISION value, int start, int end, level_struct *l ) { int thread = omp_get_thread_num(); if(thread == 0 && start != end) @@ -39,7 +39,7 @@ void vector_PRECISION_define( vector_PRECISION phi, complex_PRECISION value, int } -void vector_PRECISION_define_random( vector_PRECISION phi, int start, int end, level_struct *l ) { +void vector_PRECISION_define_random( vector_PRECISION *phi, int start, int end, level_struct *l ) { int thread = omp_get_thread_num(); if(thread == 0 && start != end) @@ -47,7 +47,7 @@ void vector_PRECISION_define_random( vector_PRECISION phi, int start, int end, l if ( phi != NULL ) { int i; for ( i=start; 
ivector_buffer[i] = (PRECISION)(((double)rand()/(double)RAND_MAX))-0.5 + ( (PRECISION)((double)rand()/(double)RAND_MAX)-0.5)*_Complex_I; } else { error0("Error in \"vector_PRECISION_define_random\": pointer is null\n"); } diff --git a/src/data_generic.h b/src/data_generic.h index b236ab4..9ac8a58 100644 --- a/src/data_generic.h +++ b/src/data_generic.h @@ -22,7 +22,7 @@ #ifndef DATA_PRECISION_HEADER #define DATA_PRECISION_HEADER - void vector_PRECISION_define( vector_PRECISION phi, complex_PRECISION value, int start, int end, level_struct *l ); - void vector_PRECISION_define_random( vector_PRECISION phi, int start, int end, level_struct *l ); + void buffer_PRECISION_define( complex_PRECISION *phi, complex_PRECISION value, int start, int end, level_struct *l ); + void vector_PRECISION_define_random( vector_PRECISION *phi, int start, int end, level_struct *l ); #endif diff --git a/src/dirac.c b/src/dirac.c index 068e8a7..94d4aca 100644 --- a/src/dirac.c +++ b/src/dirac.c @@ -44,14 +44,14 @@ void compute_clover_term ( SU3_storage U, level_struct *l ) { #ifdef HAVE_TM if ( g.mu + g.mu_even_shift == 0 && g.mu + g.mu_odd_shift == 0 ) - vector_double_define( op->tm_term, _COMPLEX_double_ZERO, 0, l->inner_vector_size, l ); + buffer_double_define( op->tm_term, _COMPLEX_double_ZERO, 0, l->inner_vector_size, l ); else tm_term_double_setup( g.mu, g.mu_even_shift, g.mu_odd_shift, op, l, no_threading ); #endif #ifdef HAVE_TM1p1 if ( g.epsbar == 0 && g.epsbar_ig5_even_shift == 0 && g.epsbar_ig5_odd_shift == 0 ) - vector_double_define( op->epsbar_term, _COMPLEX_double_ZERO, 0, l->inner_vector_size, l ); + buffer_double_define( op->epsbar_term, _COMPLEX_double_ZERO, 0, l->inner_vector_size, l ); else epsbar_term_double_setup( g.epsbar, g.epsbar_ig5_even_shift, g.epsbar_ig5_odd_shift, op, l, no_threading ); #endif @@ -86,7 +86,7 @@ void compute_clover_term ( SU3_storage U, level_struct *l ) { mat_free( &Qstore, 3 ); spin_free( 4, 4 ); } else { - vector_double_define( op->clover, 4+op->m0, 0, l->inner_vector_size, l ); + buffer_double_define( op->clover, 4+op->m0, 0, l->inner_vector_size, l ); } } @@ -436,7 +436,12 @@ void SU3_ghost_update( SU3_storage *U, level_struct *l ) { int t, z, y, x, mu, nu, *ll = l->local_lattice, ls[4], le[4]; long int i, j, send_size, max_size; - vector_double buffer1 = NULL, buffer2 = NULL, buffer3 = NULL, buffer4 = NULL; + vector_double buffer1, buffer2, buffer3, buffer4; + + vector_double_init(&buffer1); + vector_double_init(&buffer2); + vector_double_init(&buffer3); + vector_double_init(&buffer4); max_size = 0; for ( mu=0; mu<4; mu++ ) { @@ -448,10 +453,10 @@ void SU3_ghost_update( SU3_storage *U, level_struct *l ) { if (send_size > max_size) max_size = send_size; } - MALLOC( buffer1, complex_double, max_size ); - MALLOC( buffer2, complex_double, max_size ); - MALLOC( buffer3, complex_double, max_size ); - MALLOC( buffer4, complex_double, max_size ); + MALLOC( buffer1.vector_buffer, complex_double, max_size ); + MALLOC( buffer2.vector_buffer, complex_double, max_size ); + MALLOC( buffer3.vector_buffer, complex_double, max_size ); + MALLOC( buffer4.vector_buffer, complex_double, max_size ); for ( mu=0; mu<4; mu++ ) { ls[mu] = 1; @@ -467,13 +472,13 @@ void SU3_ghost_update( SU3_storage *U, level_struct *l ) { for ( y=ls[Y]; yneighbor_rank[2*mu], 2*mu, g.comm_cart, &(g.rreqs[2*mu]) ); - MPI_Isend( buffer1, send_size, MPI_COMPLEX_double, l->neighbor_rank[2*mu+1], 2*mu, g.comm_cart, &(g.sreqs[2*mu]) ); + MPI_Irecv( buffer3.vector_buffer, send_size, MPI_COMPLEX_double, 
l->neighbor_rank[2*mu], 2*mu, g.comm_cart, &(g.rreqs[2*mu]) ); + MPI_Isend( buffer1.vector_buffer, send_size, MPI_COMPLEX_double, l->neighbor_rank[2*mu+1], 2*mu, g.comm_cart, &(g.sreqs[2*mu]) ); // send own positive inner boundary ls[mu] = ll[mu]; @@ -483,13 +488,13 @@ void SU3_ghost_update( SU3_storage *U, level_struct *l ) { for ( y=ls[Y]; yneighbor_rank[2*mu+1], 2*mu+1, g.comm_cart, &(g.rreqs[2*mu+1]) ); - MPI_Isend( buffer2, send_size, MPI_COMPLEX_double, l->neighbor_rank[2*mu], 2*mu+1, g.comm_cart, &(g.sreqs[2*mu+1]) ); + MPI_Irecv( buffer4.vector_buffer, send_size, MPI_COMPLEX_double, l->neighbor_rank[2*mu+1], 2*mu+1, g.comm_cart, &(g.rreqs[2*mu+1]) ); + MPI_Isend( buffer2.vector_buffer, send_size, MPI_COMPLEX_double, l->neighbor_rank[2*mu], 2*mu+1, g.comm_cart, &(g.sreqs[2*mu+1]) ); //recv own positive boundary MPI_Wait( &(g.sreqs[2*mu]), MPI_STATUS_IGNORE ); @@ -502,7 +507,7 @@ void SU3_ghost_update( SU3_storage *U, level_struct *l ) { for ( y=ls[Y]; ynum_lattice_site_var; - vector_PRECISION lphi = phi+start, leta = eta+start; - vector_PRECISION leta_end = eta+end; + buffer_PRECISION lphi = phi->vector_buffer+start, leta = eta->vector_buffer+start; + buffer_PRECISION leta_end = eta->vector_buffer+end; #ifdef PROFILING START_MASTER(threading) @@ -133,7 +133,7 @@ void clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operator_PREC #endif clover += start*12; while ( leta < leta_end ) { // tm_term included in the clover vectorized - sse_site_clover_PRECISION( (PRECISION*) leta, (PRECISION*) lphi, clover ); + sse_site_clover_PRECISION( (PRECISION*)leta, (PRECISION*)lphi, clover ); leta += nv; lphi += nv; clover += 12*nv; } @@ -144,7 +144,7 @@ void clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operator_PREC #ifdef HAVE_TM1p1 config_PRECISION eps_term = op->epsbar_term+(start/nv)*12; - lphi = phi+start, leta = eta+start; + lphi = phi->vector_buffer+start, leta = eta->vector_buffer+start; if ( g.n_flavours == 2 && ( g.epsbar != 0 || g.epsbar_ig5_odd_shift != 0 || g.epsbar_ig5_odd_shift != 0 ) ) while ( leta < leta_end ) { @@ -166,47 +166,47 @@ void clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operator_PREC } -static void spin0and1_clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, config_PRECISION clover, level_struct *l ) { +static void spin0and1_clover_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, config_PRECISION clover, level_struct *l ) { - vector_PRECISION eta_end = eta + l->inner_vector_size; + buffer_PRECISION eta_end = eta->vector_buffer + l->inner_vector_size; if ( g.csw == 0.0 ) { - while ( eta < eta_end ) { - FOR6( *eta = (*phi)*(*clover); eta++; phi++; clover++; ) - FOR6( *eta = _COMPLEX_PRECISION_ZERO; eta++; ) - phi+=6; clover+=6; + while ( eta->vector_buffer < eta_end ) { + FOR6( *eta->vector_buffer = (*phi->vector_buffer)*(*clover); eta->vector_buffer++; phi->vector_buffer++; clover++; ) + FOR6( *eta->vector_buffer = _COMPLEX_PRECISION_ZERO; eta->vector_buffer++; ) + phi->vector_buffer+=6; clover+=6; } } else { - while ( eta < eta_end ) { - spin0and1_site_clover_PRECISION( eta, phi, clover ); - eta+=12; phi+=12; clover+=42; + while ( eta->vector_buffer < eta_end ) { + spin0and1_site_clover_PRECISION( eta->vector_buffer, phi->vector_buffer, clover ); + eta->vector_buffer+=12; phi->vector_buffer+=12; clover+=42; } } } -static void spin2and3_clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, config_PRECISION clover, level_struct *l ) { +static void spin2and3_clover_PRECISION( vector_PRECISION *eta, 
vector_PRECISION *phi, config_PRECISION clover, level_struct *l ) { - vector_PRECISION eta_end = eta + l->inner_vector_size; + buffer_PRECISION eta_end = eta->vector_buffer + l->inner_vector_size; if ( g.csw == 0.0 ) { - while ( eta < eta_end ) { - phi+=6; clover+=6; - FOR6( *eta = _COMPLEX_PRECISION_ZERO; eta++; ) - FOR6( *eta = (*phi)*(*clover); eta++; phi++; clover++; ) + while ( eta->vector_buffer < eta_end ) { + phi->vector_buffer+=6; clover+=6; + FOR6( *eta->vector_buffer = _COMPLEX_PRECISION_ZERO; eta->vector_buffer++; ) + FOR6( *eta->vector_buffer = (*phi->vector_buffer)*(*clover); eta->vector_buffer++; phi->vector_buffer++; clover++; ) } } else { - while ( eta < eta_end ) { - spin2and3_site_clover_PRECISION( eta, phi, clover ); - eta +=12; phi+=12; clover+=42; + while ( eta->vector_buffer < eta_end ) { + spin2and3_site_clover_PRECISION( eta->vector_buffer, phi->vector_buffer, clover ); + eta->vector_buffer +=12; phi->vector_buffer+=12; clover+=42; } } } -void block_d_plus_clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, int start, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ) { +void block_d_plus_clover_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, int start, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ) { START_UNTHREADED_FUNCTION(threading) int n = s->num_block_sites, *length = s->dir_length, **index = s->index, *neighbor = s->op.neighbor_table, nv = l->num_lattice_site_var; - vector_PRECISION lphi = phi+start, leta = eta+start; + buffer_PRECISION lphi = phi->vector_buffer+start, leta = eta->vector_buffer+start; // clover term clover_PRECISION(eta, phi, &(s->op), start, start+nv*n, l, no_threading ); @@ -349,7 +349,7 @@ void block_d_plus_clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, END_UNTHREADED_FUNCTION(threading) } -void d_plus_clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { +void d_plus_clover_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { int n = l->num_inner_lattice_sites, *neighbor = op->neighbor_table, start, end, nv = l->num_lattice_site_var; #ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION @@ -357,7 +357,7 @@ void d_plus_clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operat complex_PRECISION *prp[4] = { op->prpT, op->prpZ, op->prpY, op->prpX }; #else int i, j, *nb_pt; - vector_PRECISION phi_pt, eta_pt, end_pt; + buffer_PRECISION phi_pt, eta_pt, end_pt; config_PRECISION D_pt; #endif @@ -374,10 +374,10 @@ void d_plus_clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operat #ifdef HAVE_TM1p1 if( g.n_flavours == 2 ) { #ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - dprp_PRECISION( prn, phi, start, end ); + dprp_PRECISION( prn, phi->vector_buffer, start, end ); #else complex_PRECISION pbuf[12]; - for ( i=start/2, phi_pt=phi+start; ivector_buffer+start; iprnT+i, phi_pt ); dprp_Z_PRECISION( op->prnZ+i, phi_pt ); dprp_Y_PRECISION( op->prnY+i, phi_pt ); @@ -393,10 +393,10 @@ void d_plus_clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operat END_LOCKED_MASTER(threading) #ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - dprn_su3_PRECISION( prp, phi, op, neighbor, start, end ); + dprn_su3_PRECISION( prp, phi->vector_buffer, op, neighbor, start, end ); #else // project plus dir and multiply with U dagger - for ( phi_pt=phi+start, end_pt=phi+end, D_pt = op->D+((start/nv)*36), 
nb_pt=neighbor+((start/nv)*4); phi_ptvector_buffer+start, end_pt=phi->vector_buffer+end, D_pt = op->D+((start/nv)*36), nb_pt=neighbor+((start/nv)*4); phi_ptvector_buffer, prn, op, neighbor, start, end ); #else // multiply with U and lift up minus dir - for ( eta_pt=eta+start, end_pt=eta+end, D_pt = op->D+(start/nv)*36, nb_pt=neighbor+(start/nv)*4; eta_ptvector_buffer+start, end_pt=eta->vector_buffer+end, D_pt = op->D+(start/nv)*36, nb_pt=neighbor+(start/nv)*4; eta_ptprnT+j ); @@ -487,9 +487,9 @@ void d_plus_clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operat // lift up plus dir #ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - dpbn_PRECISION( eta, prp, start, end ); + dpbn_PRECISION( eta->vector_buffer, prp, start, end ); #else - for ( i=start/2, eta_pt=eta+start; ivector_buffer+start; iprpT+i, eta_pt ); dpbn_su3_Z_PRECISION( op->prpZ+i, eta_pt ); dpbn_su3_Y_PRECISION( op->prpY+i, eta_pt ); @@ -500,10 +500,10 @@ void d_plus_clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operat #endif #ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - prp_PRECISION( prn, phi, start, end ); + prp_PRECISION( prn, phi->vector_buffer, start, end ); #else complex_PRECISION pbuf[6]; - for ( i=start/2, phi_pt=phi+start; ivector_buffer+start; iprnT+i, phi_pt ); prp_Z_PRECISION( op->prnZ+i, phi_pt ); prp_Y_PRECISION( op->prnY+i, phi_pt ); @@ -520,9 +520,9 @@ void d_plus_clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operat // project plus dir and multiply with U dagger #ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - prn_su3_PRECISION( prp, phi, op, neighbor, start, end ); + prn_su3_PRECISION( prp, phi->vector_buffer, op, neighbor, start, end ); #else - for ( phi_pt=phi+start, end_pt=phi+end, D_pt = op->D+(start*3), nb_pt=neighbor+((start/12)*4); phi_ptvector_buffer+start, end_pt=phi->vector_buffer+end, D_pt = op->D+(start*3), nb_pt=neighbor+((start/12)*4); phi_ptvector_buffer, prn, op, neighbor, start, end ); #else - for ( eta_pt=eta+start, end_pt=eta+end, D_pt = op->D+start*3, nb_pt=neighbor+(start/12)*4; eta_ptvector_buffer+start, end_pt=eta->vector_buffer+end, D_pt = op->D+start*3, nb_pt=neighbor+(start/12)*4; eta_ptprnT+j ); @@ -597,9 +597,9 @@ void d_plus_clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operat // lift up plus dir #ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - pbn_PRECISION( eta, prp, start, end ); + pbn_PRECISION( eta->vector_buffer, prp, start, end ); #else - for ( i=start/2, eta_pt=eta+start; ivector_buffer+start; iprpT+i, eta_pt ); pbn_su3_Z_PRECISION( op->prpZ+i, eta_pt ); pbn_su3_Y_PRECISION( op->prpY+i, eta_pt ); @@ -618,48 +618,48 @@ void d_plus_clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operat } -void gamma5_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ) { +void gamma5_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, level_struct *l, struct Thread *threading ) { ASSERT(l->depth == 0); - vector_PRECISION eta_end = eta + threading->end_index[l->depth]; - eta += threading->start_index[l->depth]; - phi += threading->start_index[l->depth]; + buffer_PRECISION eta_end = eta->vector_buffer + threading->end_index[l->depth]; + eta->vector_buffer += threading->start_index[l->depth]; + phi->vector_buffer += threading->start_index[l->depth]; #ifdef HAVE_TM1p1 if ( g.n_flavours == 2 ) { - while ( eta < eta_end ) { - FOR12( *eta = -(*phi); phi++; eta++; ) - FOR12( *eta = (*phi); phi++; eta++; ) + while ( eta->vector_buffer < eta_end ) { + FOR12( *eta->vector_buffer = 
-(*phi->vector_buffer); phi->vector_buffer++; eta->vector_buffer++; ) + FOR12( *eta->vector_buffer = (*phi->vector_buffer); phi->vector_buffer++; eta->vector_buffer++; ) } } else #endif - while ( eta < eta_end ) { - FOR6( *eta = -(*phi); phi++; eta++; ) - FOR6( *eta = (*phi); phi++; eta++; ) + while ( eta->vector_buffer < eta_end ) { + FOR6( *eta->vector_buffer = -(*phi->vector_buffer); phi->vector_buffer++; eta->vector_buffer++; ) + FOR6( *eta->vector_buffer = (*phi->vector_buffer); phi->vector_buffer++; eta->vector_buffer++; ) } } -void tau1_gamma5_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ) { +void tau1_gamma5_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, level_struct *l, struct Thread *threading ) { ASSERT(l->depth == 0); #ifdef HAVE_TM1p1 if ( g.n_flavours == 2 ) { - vector_PRECISION eta_end = eta + threading->end_index[l->depth]; + buffer_PRECISION eta_end = eta->vector_buffer + threading->end_index[l->depth]; complex_PRECISION b[6]; - eta += threading->start_index[l->depth]; - phi += threading->start_index[l->depth]; - while ( eta < eta_end ) { + eta->vector_buffer += threading->start_index[l->depth]; + phi->vector_buffer += threading->start_index[l->depth]; + while ( eta->vector_buffer < eta_end ) { int i = 0; - FOR6( b[i] = (*phi); phi++; i++; ); - FOR6( *eta = -(*phi); phi++; eta++; ); + FOR6( b[i] = (*phi->vector_buffer); phi->vector_buffer++; i++; ); + FOR6( *eta->vector_buffer = -(*phi->vector_buffer); phi->vector_buffer++; eta->vector_buffer++; ); i = 0; - FOR6( *eta = - b[i] ; eta++; i++; ); + FOR6( *eta->vector_buffer = - b[i] ; eta->vector_buffer++; i++; ); i = 0; - FOR6( b[i] = (*phi); phi++; i++; ); - FOR6( *eta = (*phi); phi++; eta++; ); + FOR6( b[i] = (*phi->vector_buffer); phi->vector_buffer++; i++; ); + FOR6( *eta->vector_buffer = (*phi->vector_buffer); phi->vector_buffer++; eta->vector_buffer++; ); i = 0; - FOR6( *eta = b[i] ; eta++; i++; ); + FOR6( *eta->vector_buffer = b[i] ; eta->vector_buffer++; i++; ); } } else #endif @@ -671,100 +671,100 @@ void tau1_gamma5_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_st } } -void set_even_to_zero_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ) { +void set_even_to_zero_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, level_struct *l, struct Thread *threading ) { ASSERT(l->depth == 0); int i = threading->start_site[l->depth]; - vector_PRECISION eta_end = eta + threading->end_index[l->depth]; - eta += threading->start_index[l->depth]; - phi += threading->start_index[l->depth]; + buffer_PRECISION eta_end = eta->vector_buffer + threading->end_index[l->depth]; + eta->vector_buffer += threading->start_index[l->depth]; + phi->vector_buffer += threading->start_index[l->depth]; #ifdef HAVE_TM1p1 if ( g.n_flavours == 2 ) - while ( eta < eta_end ) { + while ( eta->vector_buffer < eta_end ) { if(g.odd_even_table[i]==_ODD) { - FOR24( *eta = (*phi); phi++; eta++; ); + FOR24( *eta->vector_buffer = (*phi->vector_buffer); phi->vector_buffer++; eta->vector_buffer++; ); } else if(g.odd_even_table[i]==_EVEN) { - FOR24( *eta = _COMPLEX_PRECISION_ZERO; phi++; eta++; ); + FOR24( *eta->vector_buffer = _COMPLEX_PRECISION_ZERO; phi->vector_buffer++; eta->vector_buffer++; ); } i++; } else #endif - while ( eta < eta_end ) { + while ( eta->vector_buffer < eta_end ) { if(g.odd_even_table[i]==_ODD) { - FOR12( *eta = (*phi); phi++; eta++; ); + FOR12( *eta->vector_buffer = (*phi->vector_buffer); phi->vector_buffer++; 
eta->vector_buffer++; ); } else if(g.odd_even_table[i]==_EVEN) { - FOR12( *eta = _COMPLEX_PRECISION_ZERO; phi++; eta++; ); + FOR12( *eta->vector_buffer = _COMPLEX_PRECISION_ZERO; phi->vector_buffer++; eta->vector_buffer++; ); } i++; } } -void gamma5_set_even_to_zero_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ) { +void gamma5_set_even_to_zero_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, level_struct *l, struct Thread *threading ) { ASSERT(l->depth == 0); int i = threading->start_site[l->depth]; - vector_PRECISION eta_end = eta + threading->end_index[l->depth]; - eta += threading->start_index[l->depth]; - phi += threading->start_index[l->depth]; + buffer_PRECISION eta_end = eta->vector_buffer + threading->end_index[l->depth]; + eta->vector_buffer += threading->start_index[l->depth]; + phi->vector_buffer += threading->start_index[l->depth]; #ifdef HAVE_TM1p1 if ( g.n_flavours == 2 ) - while ( eta < eta_end ) { + while ( eta->vector_buffer < eta_end ) { if(g.odd_even_table[i]==_ODD){ - FOR12( *eta = -(*phi); phi++; eta++; ); - FOR12( *eta = (*phi); phi++; eta++; ); + FOR12( *eta->vector_buffer = -(*phi->vector_buffer); phi->vector_buffer++; eta->vector_buffer++; ); + FOR12( *eta->vector_buffer = (*phi->vector_buffer); phi->vector_buffer++; eta->vector_buffer++; ); } else if(g.odd_even_table[i]==_EVEN){ - FOR24( *eta = _COMPLEX_PRECISION_ZERO; phi++; eta++; ); + FOR24( *eta->vector_buffer = _COMPLEX_PRECISION_ZERO; phi->vector_buffer++; eta->vector_buffer++; ); } i++; } else #endif - while ( eta < eta_end ) { + while ( eta->vector_buffer < eta_end ) { if(g.odd_even_table[i]==_ODD){ - FOR6( *eta = -(*phi); phi++; eta++; ); - FOR6( *eta = (*phi); phi++; eta++; ); + FOR6( *eta->vector_buffer = -(*phi->vector_buffer); phi->vector_buffer++; eta->vector_buffer++; ); + FOR6( *eta->vector_buffer = (*phi->vector_buffer); phi->vector_buffer++; eta->vector_buffer++; ); } else if(g.odd_even_table[i]==_EVEN){ - FOR12( *eta = _COMPLEX_PRECISION_ZERO; phi++; eta++; ); + FOR12( *eta->vector_buffer = _COMPLEX_PRECISION_ZERO; phi->vector_buffer++; eta->vector_buffer++; ); } i++; } } -void tau1_gamma5_set_even_to_zero_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ) { +void tau1_gamma5_set_even_to_zero_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, level_struct *l, struct Thread *threading ) { ASSERT(l->depth == 0); #ifdef HAVE_TM1p1 if ( g.n_flavours == 2 ) { int i = threading->start_site[l->depth]; - vector_PRECISION eta_end = eta + threading->end_index[l->depth]; - eta += threading->start_index[l->depth]; - phi += threading->start_index[l->depth]; + buffer_PRECISION eta_end = eta->vector_buffer + threading->end_index[l->depth]; + eta->vector_buffer += threading->start_index[l->depth]; + phi->vector_buffer += threading->start_index[l->depth]; complex_PRECISION b[6]; - while ( eta < eta_end ) { + while ( eta->vector_buffer < eta_end ) { if(g.odd_even_table[i]==_ODD){ int i = 0; - FOR6( b[i] = (*phi); phi++; i++; ); - FOR6( *eta = -(*phi); phi++; eta++; ); + FOR6( b[i] = (*phi->vector_buffer); phi->vector_buffer++; i++; ); + FOR6( *eta->vector_buffer = -(*phi->vector_buffer); phi->vector_buffer++; eta->vector_buffer++; ); i = 0; - FOR6( *eta = - b[i] ; eta++; i++; ); + FOR6( *eta->vector_buffer = - b[i] ; eta->vector_buffer++; i++; ); i = 0; - FOR6( b[i] = (*phi); phi++; i++; ); - FOR6( *eta = (*phi); phi++; eta++; ); + FOR6( b[i] = (*phi->vector_buffer); phi->vector_buffer++; i++; ); + 
FOR6( *eta->vector_buffer = (*phi->vector_buffer); phi->vector_buffer++; eta->vector_buffer++; ); i = 0; - FOR6( *eta = b[i] ; eta++; i++; ); + FOR6( *eta->vector_buffer = b[i] ; eta->vector_buffer++; i++; ); } else if(g.odd_even_table[i]==_EVEN){ - FOR24( *eta = _COMPLEX_PRECISION_ZERO; phi++; eta++; ); + FOR24( *eta->vector_buffer = _COMPLEX_PRECISION_ZERO; phi->vector_buffer++; eta->vector_buffer++; ); } i++; } @@ -778,96 +778,96 @@ void tau1_gamma5_set_even_to_zero_PRECISION( vector_PRECISION eta, vector_PRECIS } } -void set_odd_to_zero_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ) { +void set_odd_to_zero_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, level_struct *l, struct Thread *threading ) { int i = threading->start_site[l->depth]; - vector_PRECISION eta_end = eta + threading->end_index[l->depth]; - eta += threading->start_index[l->depth]; - phi += threading->start_index[l->depth]; + buffer_PRECISION eta_end = eta->vector_buffer + threading->end_index[l->depth]; + eta->vector_buffer += threading->start_index[l->depth]; + phi->vector_buffer += threading->start_index[l->depth]; #ifdef HAVE_TM1p1 if ( g.n_flavours == 2 ) - while ( eta < eta_end ) { + while ( eta->vector_buffer < eta_end ) { if(g.odd_even_table[i]==_EVEN){ - FOR24( *eta = (*phi); phi++; eta++; ); + FOR24( *eta->vector_buffer = (*phi->vector_buffer); phi->vector_buffer++; eta->vector_buffer++; ); } else if(g.odd_even_table[i]==_ODD){ - FOR24( *eta = 0; phi++; eta++; ); + FOR24( *eta->vector_buffer = 0; phi->vector_buffer++; eta->vector_buffer++; ); } i++; } else #endif - while ( eta < eta_end ) { + while ( eta->vector_buffer < eta_end ) { if(g.odd_even_table[i]==_EVEN) { - FOR12( *eta = (*phi); phi++; eta++; ); + FOR12( *eta->vector_buffer = (*phi->vector_buffer); phi->vector_buffer++; eta->vector_buffer++; ); } else if(g.odd_even_table[i]==_ODD) { - FOR12( *eta = 0; phi++; eta++; ); + FOR12( *eta->vector_buffer = 0; phi->vector_buffer++; eta->vector_buffer++; ); } i++; } } -void gamma5_set_odd_to_zero_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ) { +void gamma5_set_odd_to_zero_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, level_struct *l, struct Thread *threading ) { int i = threading->start_site[l->depth]; - vector_PRECISION eta_end = eta + threading->end_index[l->depth]; - eta += threading->start_index[l->depth]; - phi += threading->start_index[l->depth]; + buffer_PRECISION eta_end = eta->vector_buffer + threading->end_index[l->depth]; + eta->vector_buffer += threading->start_index[l->depth]; + phi->vector_buffer += threading->start_index[l->depth]; #ifdef HAVE_TM1p1 if ( g.n_flavours == 2 ) - while ( eta < eta_end ) { + while ( eta->vector_buffer < eta_end ) { if(g.odd_even_table[i]==_EVEN){ - FOR12( *eta = -(*phi); phi++; eta++; ); - FOR12( *eta = (*phi); phi++; eta++; ); + FOR12( *eta->vector_buffer = -(*phi->vector_buffer); phi->vector_buffer++; eta->vector_buffer++; ); + FOR12( *eta->vector_buffer = (*phi->vector_buffer); phi->vector_buffer++; eta->vector_buffer++; ); } else if(g.odd_even_table[i]==_ODD){ - FOR24( *eta = 0; phi++; eta++; ); + FOR24( *eta->vector_buffer = 0; phi->vector_buffer++; eta->vector_buffer++; ); } i++; } else #endif - while ( eta < eta_end ) { + while ( eta->vector_buffer < eta_end ) { if(g.odd_even_table[i]==_EVEN){ - FOR6( *eta = -(*phi); phi++; eta++; ); - FOR6( *eta = (*phi); phi++; eta++; ); + FOR6( *eta->vector_buffer = -(*phi->vector_buffer); 
phi->vector_buffer++; eta->vector_buffer++; ); + FOR6( *eta->vector_buffer = (*phi->vector_buffer); phi->vector_buffer++; eta->vector_buffer++; ); } else if(g.odd_even_table[i]==_ODD){ - FOR12( *eta = 0; phi++; eta++; ); + FOR12( *eta->vector_buffer = 0; phi->vector_buffer++; eta->vector_buffer++; ); } i++; } } -void tau1_gamma5_set_odd_to_zero_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ) { +void tau1_gamma5_set_odd_to_zero_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, level_struct *l, struct Thread *threading ) { ASSERT(l->depth == 0); #ifdef HAVE_TM1p1 if ( g.n_flavours == 2 ) { int i = threading->start_site[l->depth]; - vector_PRECISION eta_end = eta + threading->end_index[l->depth]; - eta += threading->start_index[l->depth]; - phi += threading->start_index[l->depth]; + buffer_PRECISION eta_end = eta->vector_buffer + threading->end_index[l->depth]; + eta->vector_buffer += threading->start_index[l->depth]; + phi->vector_buffer += threading->start_index[l->depth]; complex_PRECISION b[6]; - while ( eta < eta_end ) { + while ( eta->vector_buffer < eta_end ) { if(g.odd_even_table[i]==_EVEN){ int i = 0; - FOR6( b[i] = (*phi); phi++; i++; ); - FOR6( *eta = -(*phi); phi++; eta++; ); + FOR6( b[i] = (*phi->vector_buffer); phi->vector_buffer++; i++; ); + FOR6( *eta->vector_buffer = -(*phi->vector_buffer); phi->vector_buffer++; eta->vector_buffer++; ); i = 0; - FOR6( *eta = - b[i] ; eta++; i++; ); + FOR6( *eta->vector_buffer = - b[i] ; eta->vector_buffer++; i++; ); i = 0; - FOR6( b[i] = (*phi); phi++; i++; ); - FOR6( *eta = (*phi); phi++; eta++; ); + FOR6( b[i] = (*phi->vector_buffer); phi->vector_buffer++; i++; ); + FOR6( *eta->vector_buffer = (*phi->vector_buffer); phi->vector_buffer++; eta->vector_buffer++; ); i = 0; - FOR6( *eta = b[i] ; eta++; i++; ); + FOR6( *eta->vector_buffer = b[i] ; eta->vector_buffer++; i++; ); } else if(g.odd_even_table[i]==_ODD){ - FOR24( *eta = _COMPLEX_PRECISION_ZERO; phi++; eta++; ); + FOR24( *eta->vector_buffer = _COMPLEX_PRECISION_ZERO; phi->vector_buffer++; eta->vector_buffer++; ); } i++; } @@ -881,40 +881,40 @@ void tau1_gamma5_set_odd_to_zero_PRECISION( vector_PRECISION eta, vector_PRECISI } } -void scale_even_odd_PRECISION( vector_PRECISION eta, vector_PRECISION phi, complex_double even, complex_double odd, +void scale_even_odd_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, complex_double even, complex_double odd, level_struct *l, struct Thread *threading ) { int i = threading->start_site[l->depth]; - vector_PRECISION eta_end = eta + threading->end_index[l->depth]; - eta += threading->start_index[l->depth]; - phi += threading->start_index[l->depth]; + buffer_PRECISION eta_end = eta->vector_buffer + threading->end_index[l->depth]; + eta->vector_buffer += threading->start_index[l->depth]; + phi->vector_buffer += threading->start_index[l->depth]; #ifdef HAVE_TM1p1 if ( g.n_flavours == 2 ) - while ( eta < eta_end ) { + while ( eta->vector_buffer < eta_end ) { if(g.odd_even_table[i]==_EVEN){ - FOR24( *eta = even*(*phi); phi++; eta++; ); + FOR24( *eta->vector_buffer = even*(*phi->vector_buffer); phi->vector_buffer++; eta->vector_buffer++; ); } else if(g.odd_even_table[i]==_ODD){ - FOR24( *eta = odd*(*phi); phi++; eta++; ); + FOR24( *eta->vector_buffer = odd*(*phi->vector_buffer); phi->vector_buffer++; eta->vector_buffer++; ); } i++; } else #endif - while ( eta < eta_end ) { + while ( eta->vector_buffer < eta_end ) { if(g.odd_even_table[i]==_EVEN) { - FOR12( *eta = even*(*phi); phi++; eta++; 
); + FOR12( *eta->vector_buffer = even*(*phi->vector_buffer); phi->vector_buffer++; eta->vector_buffer++; ); } else if(g.odd_even_table[i]==_ODD) { - FOR12( *eta = odd*(*phi); phi++; eta++; ); + FOR12( *eta->vector_buffer = odd*(*phi->vector_buffer); phi->vector_buffer++; eta->vector_buffer++; ); } i++; } } -void two_flavours_to_serial_PRECISION( vector_PRECISION flav1, vector_PRECISION flav2, vector_PRECISION serial, level_struct *l, struct Thread *threading ) { +void two_flavours_to_serial_PRECISION( vector_PRECISION *flav1, vector_PRECISION *flav2, vector_PRECISION *serial, level_struct *l, struct Thread *threading ) { #ifdef HAVE_TM1p1 @@ -927,23 +927,23 @@ void two_flavours_to_serial_PRECISION( vector_PRECISION flav1, vector_PRECISION vector_PRECISION serial_end; if( g.n_flavours == 2 ) { - serial_end = serial + threading->end_index[l->depth]; - serial += threading->start_index[l->depth]; - flav1 += threading->start_index[l->depth]/2; - flav2 += threading->start_index[l->depth]/2; + serial_end.vector_buffer = serial->vector_buffer + threading->end_index[l->depth]; + serial->vector_buffer += threading->start_index[l->depth]; + flav1->vector_buffer += threading->start_index[l->depth]/2; + flav2->vector_buffer += threading->start_index[l->depth]/2; } else { - serial_end = serial + threading->end_index[l->depth]*2; - serial += threading->start_index[l->depth]*2; - flav1 += threading->start_index[l->depth]; - flav2 += threading->start_index[l->depth]; + serial_end.vector_buffer = serial->vector_buffer + threading->end_index[l->depth]*2; + serial->vector_buffer += threading->start_index[l->depth]*2; + flav1->vector_buffer += threading->start_index[l->depth]; + flav2->vector_buffer += threading->start_index[l->depth]; } - while ( serial < serial_end ) { - FOR6( *serial = (*flav1); serial++; flav1++; ) - FOR6( *serial = (*flav2); serial++; flav2++; ) - FOR6( *serial = (*flav1); serial++; flav1++; ) - FOR6( *serial = (*flav2); serial++; flav2++; ) + while ( serial->vector_buffer < serial_end.vector_buffer ) { + FOR6( *serial->vector_buffer = (*flav1->vector_buffer); serial->vector_buffer++; flav1->vector_buffer++; ) + FOR6( *serial->vector_buffer = (*flav2->vector_buffer); serial->vector_buffer++; flav2->vector_buffer++; ) + FOR6( *serial->vector_buffer = (*flav1->vector_buffer); serial->vector_buffer++; flav1->vector_buffer++; ) + FOR6( *serial->vector_buffer = (*flav2->vector_buffer); serial->vector_buffer++; flav2->vector_buffer++; ) } #else START_MASTER(threading) @@ -953,29 +953,29 @@ void two_flavours_to_serial_PRECISION( vector_PRECISION flav1, vector_PRECISION } -void serial_to_two_flavours_PRECISION( vector_PRECISION flav1, vector_PRECISION flav2, vector_PRECISION serial, level_struct *l, struct Thread *threading ) { +void serial_to_two_flavours_PRECISION( vector_PRECISION *flav1, vector_PRECISION *flav2, vector_PRECISION *serial, level_struct *l, struct Thread *threading ) { #ifdef HAVE_TM1p1 vector_PRECISION serial_end; if( g.n_flavours == 2 ) { - serial_end = serial + threading->end_index[l->depth]; - serial += threading->start_index[l->depth]; - flav1 += threading->start_index[l->depth]/2; - flav2 += threading->start_index[l->depth]/2; + serial_end.vector_buffer = serial->vector_buffer + threading->end_index[l->depth]; + serial->vector_buffer += threading->start_index[l->depth]; + flav1->vector_buffer += threading->start_index[l->depth]/2; + flav2->vector_buffer += threading->start_index[l->depth]/2; } else { - serial_end = serial + threading->end_index[l->depth]*2; - serial += 
threading->start_index[l->depth]*2; - flav1 += threading->start_index[l->depth]; - flav2 += threading->start_index[l->depth]; + serial_end.vector_buffer = serial->vector_buffer + threading->end_index[l->depth]*2; + serial->vector_buffer += threading->start_index[l->depth]*2; + flav1->vector_buffer += threading->start_index[l->depth]; + flav2->vector_buffer += threading->start_index[l->depth]; } - while ( serial < serial_end ) { - FOR6( *flav1 = (*serial); serial++; flav1++; ) - FOR6( *flav2 = (*serial); serial++; flav2++; ) - FOR6( *flav1 = (*serial); serial++; flav1++; ) - FOR6( *flav2 = (*serial); serial++; flav2++; ) + while ( serial->vector_buffer < serial_end.vector_buffer ) { + FOR6( *flav1->vector_buffer = (*serial->vector_buffer); serial->vector_buffer++; flav1->vector_buffer++; ) + FOR6( *flav2->vector_buffer = (*serial->vector_buffer); serial->vector_buffer++; flav2->vector_buffer++; ) + FOR6( *flav1->vector_buffer = (*serial->vector_buffer); serial->vector_buffer++; flav1->vector_buffer++; ) + FOR6( *flav2->vector_buffer = (*serial->vector_buffer); serial->vector_buffer++; flav2->vector_buffer++; ) } #else START_MASTER(threading) @@ -985,28 +985,28 @@ void serial_to_two_flavours_PRECISION( vector_PRECISION flav1, vector_PRECISION } -void g5D_plus_clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { +void g5D_plus_clover_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { d_plus_clover_PRECISION( eta, phi, op, l, threading ); SYNC_CORES(threading) gamma5_PRECISION( eta, eta, l, threading ); SYNC_CORES(threading) } -void diagonal_aggregate_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, vector_PRECISION phi, config_PRECISION diag, level_struct *l ) { +void diagonal_aggregate_PRECISION( vector_PRECISION *eta1, vector_PRECISION *eta2, vector_PRECISION *phi, config_PRECISION diag, level_struct *l ) { - vector_PRECISION eta_end = eta1 + l->inner_vector_size; + buffer_PRECISION eta_end = eta1->vector_buffer + l->inner_vector_size; - while ( eta1 < eta_end ) { - FOR6( *eta1 = (*phi)*(*diag); *eta2 = _COMPLEX_PRECISION_ZERO; eta1++; eta2++; phi++; diag++; ); - FOR6( *eta2 = (*phi)*(*diag); *eta1 = _COMPLEX_PRECISION_ZERO; eta1++; eta2++; phi++; diag++; ); + while ( eta1->vector_buffer < eta_end ) { + FOR6( *eta1->vector_buffer = (*phi->vector_buffer)*(*diag); *eta2->vector_buffer = _COMPLEX_PRECISION_ZERO; eta1->vector_buffer++; eta2->vector_buffer++; phi->vector_buffer++; diag++; ); + FOR6( *eta2->vector_buffer = (*phi->vector_buffer)*(*diag); *eta1->vector_buffer = _COMPLEX_PRECISION_ZERO; eta1->vector_buffer++; eta2->vector_buffer++; phi->vector_buffer++; diag++; ); } } -void d_plus_clover_aggregate_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, vector_PRECISION phi, schwarz_PRECISION_struct *s, level_struct *l ) { +void d_plus_clover_aggregate_PRECISION( vector_PRECISION *eta1, vector_PRECISION *eta2, vector_PRECISION *phi, schwarz_PRECISION_struct *s, level_struct *l ) { int i, length, index1, index2, *index_dir, *neighbor = s->op.neighbor_table; - vector_PRECISION eta1_pt, eta2_pt, phi_pt; + buffer_PRECISION eta1_pt, eta2_pt, phi_pt; complex_PRECISION buffer1[12], buffer2[12]; config_PRECISION D_pt, D = s->op.D; @@ -1018,84 +1018,84 @@ void d_plus_clover_aggregate_PRECISION( vector_PRECISION eta1, vector_PRECISION length = l->is_PRECISION.agg_length[T]; index_dir = l->is_PRECISION.agg_index[T]; for 
( i=0; ivector_buffer + 12*index2; D_pt = D + 36*index1+9*T; mvm_PRECISION( buffer1, D_pt, phi_pt ); mvm_PRECISION( buffer1+3, D_pt, phi_pt+3 ); mvm_PRECISION( buffer1+6, D_pt, phi_pt+6 ); mvm_PRECISION( buffer1+9, D_pt, phi_pt+9 ); - phi_pt = phi + 12*index1; + phi_pt = phi->vector_buffer + 12*index1; mvmh_PRECISION( buffer2, D_pt, phi_pt ); mvmh_PRECISION( buffer2+3, D_pt, phi_pt+3 ); mvmh_PRECISION( buffer2+6, D_pt, phi_pt+6 ); mvmh_PRECISION( buffer2+9, D_pt, phi_pt+9 ); - eta1_pt = eta1 + 12*index1; eta2_pt = eta2 + 12*index1; + eta1_pt = eta1->vector_buffer + 12*index1; eta2_pt = eta2->vector_buffer + 12*index1; twospin_p_T_PRECISION( eta1_pt, eta2_pt, buffer1 ); - eta1_pt = eta1 + 12*index2; eta2_pt = eta2 + 12*index2; + eta1_pt = eta1->vector_buffer + 12*index2; eta2_pt = eta2->vector_buffer + 12*index2; twospin_n_T_PRECISION( eta1_pt, eta2_pt, buffer2 ); } // Z dir length = l->is_PRECISION.agg_length[Z]; index_dir = l->is_PRECISION.agg_index[Z]; for ( i=0; ivector_buffer + 12*index2; D_pt = D + 36*index1+9*Z; mvm_PRECISION( buffer1, D_pt, phi_pt ); mvm_PRECISION( buffer1+3, D_pt, phi_pt+3 ); mvm_PRECISION( buffer1+6, D_pt, phi_pt+6 ); mvm_PRECISION( buffer1+9, D_pt, phi_pt+9 ); - phi_pt = phi + 12*index1; + phi_pt = phi->vector_buffer + 12*index1; mvmh_PRECISION( buffer2, D_pt, phi_pt ); mvmh_PRECISION( buffer2+3, D_pt, phi_pt+3 ); mvmh_PRECISION( buffer2+6, D_pt, phi_pt+6 ); mvmh_PRECISION( buffer2+9, D_pt, phi_pt+9 ); - eta1_pt = eta1 + 12*index1; eta2_pt = eta2 + 12*index1; + eta1_pt = eta1->vector_buffer + 12*index1; eta2_pt = eta2->vector_buffer + 12*index1; twospin_p_Z_PRECISION( eta1_pt, eta2_pt, buffer1 ); - eta1_pt = eta1 + 12*index2; eta2_pt = eta2 + 12*index2; + eta1_pt = eta1->vector_buffer + 12*index2; eta2_pt = eta2->vector_buffer + 12*index2; twospin_n_Z_PRECISION( eta1_pt, eta2_pt, buffer2 ); } // Y dir length = l->is_PRECISION.agg_length[Y]; index_dir = l->is_PRECISION.agg_index[Y]; for ( i=0; ivector_buffer + 12*index2; D_pt = D + 36*index1+9*Y; mvm_PRECISION( buffer1, D_pt, phi_pt ); mvm_PRECISION( buffer1+3, D_pt, phi_pt+3 ); mvm_PRECISION( buffer1+6, D_pt, phi_pt+6 ); mvm_PRECISION( buffer1+9, D_pt, phi_pt+9 ); - phi_pt = phi + 12*index1; + phi_pt = phi->vector_buffer + 12*index1; mvmh_PRECISION( buffer2, D_pt, phi_pt ); mvmh_PRECISION( buffer2+3, D_pt, phi_pt+3 ); mvmh_PRECISION( buffer2+6, D_pt, phi_pt+6 ); mvmh_PRECISION( buffer2+9, D_pt, phi_pt+9 ); - eta1_pt = eta1 + 12*index1; eta2_pt = eta2 + 12*index1; + eta1_pt = eta1->vector_buffer + 12*index1; eta2_pt = eta2->vector_buffer + 12*index1; twospin_p_Y_PRECISION( eta1_pt, eta2_pt, buffer1 ); - eta1_pt = eta1 + 12*index2; eta2_pt = eta2 + 12*index2; + eta1_pt = eta1->vector_buffer + 12*index2; eta2_pt = eta2->vector_buffer + 12*index2; twospin_n_Y_PRECISION( eta1_pt, eta2_pt, buffer2 ); } // X dir length = l->is_PRECISION.agg_length[X]; index_dir = l->is_PRECISION.agg_index[X]; for ( i=0; ivector_buffer + 12*index2; D_pt = D + 36*index1+9*X; mvm_PRECISION( buffer1, D_pt, phi_pt ); mvm_PRECISION( buffer1+3, D_pt, phi_pt+3 ); mvm_PRECISION( buffer1+6, D_pt, phi_pt+6 ); mvm_PRECISION( buffer1+9, D_pt, phi_pt+9 ); - phi_pt = phi + 12*index1; + phi_pt = phi->vector_buffer + 12*index1; mvmh_PRECISION( buffer2, D_pt, phi_pt ); mvmh_PRECISION( buffer2+3, D_pt, phi_pt+3 ); mvmh_PRECISION( buffer2+6, D_pt, phi_pt+6 ); mvmh_PRECISION( buffer2+9, D_pt, phi_pt+9 ); - eta1_pt = eta1 + 12*index1; eta2_pt = eta2 + 12*index1; + eta1_pt = eta1->vector_buffer + 12*index1; eta2_pt = eta2->vector_buffer + 12*index1; 
twospin_p_X_PRECISION( eta1_pt, eta2_pt, buffer1 ); - eta1_pt = eta1 + 12*index2; eta2_pt = eta2 + 12*index2; + eta1_pt = eta1->vector_buffer + 12*index2; eta2_pt = eta2->vector_buffer + 12*index2; twospin_n_X_PRECISION( eta1_pt, eta2_pt, buffer2 ); } } -void d_neighbor_aggregate_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, vector_PRECISION phi, const int mu, schwarz_PRECISION_struct *s, level_struct *l ) { +void d_neighbor_aggregate_PRECISION( vector_PRECISION *eta1, vector_PRECISION *eta2, vector_PRECISION *phi, const int mu, schwarz_PRECISION_struct *s, level_struct *l ) { int i, length, index1, index2, *index_dir, *neighbor; - vector_PRECISION eta1_pt, eta2_pt, phi_pt; + buffer_PRECISION eta1_pt, eta2_pt, phi_pt; complex_PRECISION buffer1[12]; config_PRECISION D_pt, D = s->op.D; @@ -1108,54 +1108,54 @@ void d_neighbor_aggregate_PRECISION( vector_PRECISION eta1, vector_PRECISION eta // T dir for ( i=0; ivector_buffer + 12*index2; D_pt = D + 36*index1 + 9*T; mvm_PRECISION( buffer1, D_pt, phi_pt ); mvm_PRECISION( buffer1+3, D_pt, phi_pt+3 ); mvm_PRECISION( buffer1+6, D_pt, phi_pt+6 ); mvm_PRECISION( buffer1+9, D_pt, phi_pt+9 ); - eta1_pt = eta1 + 12*index1; eta2_pt = eta2 + 12*index1; + eta1_pt = eta1->vector_buffer + 12*index1; eta2_pt = eta2->vector_buffer + 12*index1; twospin2_p_T_PRECISION( eta1_pt, eta2_pt, buffer1 ); } } else if ( mu == Z ) { // Z dir for ( i=0; ivector_buffer + 12*index2; D_pt = D + 36*index1 + 9*Z; mvm_PRECISION( buffer1, D_pt, phi_pt ); mvm_PRECISION( buffer1+3, D_pt, phi_pt+3 ); mvm_PRECISION( buffer1+6, D_pt, phi_pt+6 ); mvm_PRECISION( buffer1+9, D_pt, phi_pt+9 ); - eta1_pt = eta1 + 12*index1; eta2_pt = eta2 + 12*index1; + eta1_pt = eta1->vector_buffer + 12*index1; eta2_pt = eta2->vector_buffer + 12*index1; twospin2_p_Z_PRECISION( eta1_pt, eta2_pt, buffer1 ); } } else if ( mu == Y ) { // Y dir for ( i=0; ivector_buffer + 12*index2; D_pt = D + 36*index1 + 9*Y; mvm_PRECISION( buffer1, D_pt, phi_pt ); mvm_PRECISION( buffer1+3, D_pt, phi_pt+3 ); mvm_PRECISION( buffer1+6, D_pt, phi_pt+6 ); mvm_PRECISION( buffer1+9, D_pt, phi_pt+9 ); - eta1_pt = eta1 + 12*index1; eta2_pt = eta2 + 12*index1; + eta1_pt = eta1->vector_buffer + 12*index1; eta2_pt = eta2->vector_buffer + 12*index1; twospin2_p_Y_PRECISION( eta1_pt, eta2_pt, buffer1 ); } } else if ( mu == X ) { // X dir for ( i=0; ivector_buffer + 12*index2; D_pt = D + 36*index1 + 9*X; mvm_PRECISION( buffer1, D_pt, phi_pt ); mvm_PRECISION( buffer1+3, D_pt, phi_pt+3 ); mvm_PRECISION( buffer1+6, D_pt, phi_pt+6 ); mvm_PRECISION( buffer1+9, D_pt, phi_pt+9 ); - eta1_pt = eta1 + 12*index1; eta2_pt = eta2 + 12*index1; + eta1_pt = eta1->vector_buffer + 12*index1; eta2_pt = eta2->vector_buffer + 12*index1; twospin2_p_X_PRECISION( eta1_pt, eta2_pt, buffer1 ); } } } -void apply_twisted_bc_to_vector_PRECISION( vector_PRECISION eta, vector_PRECISION phi, double *theta, level_struct *l) { +void apply_twisted_bc_to_vector_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, double *theta, level_struct *l) { int t, z, y, x, i; int *gl=l->global_lattice, sl[4]; double phase[4]; @@ -1174,10 +1174,10 @@ void apply_twisted_bc_to_vector_PRECISION( vector_PRECISION eta, vector_PRECISIO twisted_bc = exp(I*phase[X]); #ifdef HAVE_TM1p1 if( g.n_flavours == 2 ) { - FOR24( *eta = (*phi)*twisted_bc; phi++; eta++; ); + FOR24( *eta->vector_buffer = (*phi->vector_buffer)*twisted_bc; phi->vector_buffer++; eta->vector_buffer++; ); } else #endif - { FOR12( *eta = (*phi)*twisted_bc; phi++; eta++; ) } + { FOR12( *eta->vector_buffer = 
(*phi->vector_buffer)*twisted_bc; phi->vector_buffer++; eta->vector_buffer++; ) } } } } @@ -1457,8 +1457,11 @@ void two_flavours_test_PRECISION( operator_PRECISION_struct *op, level_struct *l #ifdef HAVE_TM1p1 double diff; - vector_double vd1=NULL, vd2, vd3, vd4, vdd1, vdd2, vdd3, vdd4; - vector_PRECISION vpp1=NULL, vpp2; + vector_double vd1, vd2, vd3, vd4, vdd1, vdd2, vdd3, vdd4; + vector_PRECISION vpp1, vpp2; + + vector_double_init(&vd1); + vector_PRECISION_init(&vpp1); ASSERT(g.n_flavours==2); @@ -1466,64 +1469,64 @@ void two_flavours_test_PRECISION( operator_PRECISION_struct *op, level_struct *l int ivs = l->inner_vector_size; - PUBLIC_MALLOC( vd1, complex_double, 4*ivs + 2*4*ivs ); - PUBLIC_MALLOC( vpp1, complex_PRECISION, 2*2*ivs ); + PUBLIC_MALLOC( vd1.vector_buffer, complex_double, 4*ivs + 2*4*ivs ); + PUBLIC_MALLOC( vpp1.vector_buffer, complex_PRECISION, 2*2*ivs ); - vd2 = vd1 + ivs; vd3 = vd2 + ivs; vd4 = vd3 + ivs; - vdd1 = vd4 + ivs; vdd2 = vdd1 + 2*ivs; vdd3 = vdd2 + 2*ivs; vdd4 = vdd3 + 2*ivs; - vpp2 = vpp1 + 2*ivs; + vd2.vector_buffer = vd1.vector_buffer + ivs; vd3.vector_buffer = vd2.vector_buffer + ivs; vd4.vector_buffer = vd3.vector_buffer + ivs; + vdd1.vector_buffer = vd4.vector_buffer + ivs; vdd2.vector_buffer = vdd1.vector_buffer + 2*ivs; vdd3.vector_buffer = vdd2.vector_buffer + 2*ivs; vdd4.vector_buffer = vdd3.vector_buffer + 2*ivs; + vpp2.vector_buffer = vpp1.vector_buffer + 2*ivs; START_LOCKED_MASTER(threading) - vector_double_define_random( vd1, 0, l->inner_vector_size, l ); - vector_double_define_random( vd2, 0, l->inner_vector_size, l ); - apply_operator_double( vd3, vd1, &(g.p), l, no_threading ); + vector_double_define_random( &vd1, 0, l->inner_vector_size, l ); + vector_double_define_random( &vd2, 0, l->inner_vector_size, l ); + apply_operator_double( &vd3, &vd1, &(g.p), l, no_threading ); #ifdef HAVE_TM - vector_double_real_scale( g.op_double.tm_term, g.op_double.tm_term, -1, 0, l->inner_vector_size, l ); + buffer_double_real_scale( g.op_double.tm_term, g.op_double.tm_term, -1, 0, l->inner_vector_size, l ); #endif - apply_operator_double( vd4, vd2, &(g.p), l, no_threading ); + apply_operator_double( &vd4, &vd2, &(g.p), l, no_threading ); #ifdef HAVE_TM - vector_double_real_scale( g.op_double.tm_term, g.op_double.tm_term, -1, 0, l->inner_vector_size, l ); + buffer_double_real_scale( g.op_double.tm_term, g.op_double.tm_term, -1, 0, l->inner_vector_size, l ); #endif - add_diagonal_double( vd3, vd2, g.op_double.epsbar_term, l->inner_vector_size ); - add_diagonal_double( vd4, vd1, g.op_double.epsbar_term, l->inner_vector_size ); + add_diagonal_double( &vd3, &vd2, g.op_double.epsbar_term, l->inner_vector_size ); + add_diagonal_double( &vd4, &vd1, g.op_double.epsbar_term, l->inner_vector_size ); - two_flavours_to_serial_double( vd1, vd2, vdd1, l, no_threading ); - two_flavours_to_serial_double( vd3, vd4, vdd2, l, no_threading ); + two_flavours_to_serial_double( &vd1, &vd2, &vdd1, l, no_threading ); + two_flavours_to_serial_double( &vd3, &vd4, &vdd2, l, no_threading ); END_LOCKED_MASTER(threading) data_layout_n_flavours( 2, l, threading ); START_LOCKED_MASTER(threading) - trans_PRECISION( vpp1, vdd1, op->translation_table, l, no_threading ); - apply_operator_PRECISION( vpp2, vpp1, &(l->p_PRECISION), l, no_threading ); - trans_back_PRECISION( vdd3, vpp2, op->translation_table, l, no_threading ); + trans_PRECISION( &vpp1, &vdd1, op->translation_table, l, no_threading ); + apply_operator_PRECISION( &vpp2, &vpp1, &(l->p_PRECISION), l, no_threading ); + 
trans_back_PRECISION( &vdd3, &vpp2, op->translation_table, l, no_threading ); - vector_double_minus( vdd4, vdd3, vdd2, 0, l->inner_vector_size, l ); - diff = global_norm_double( vdd4, 0, l->inner_vector_size, l, no_threading ) / - global_norm_double( vdd3, 0, l->inner_vector_size, l, no_threading ); + vector_double_minus( &vdd4, &vdd3, &vdd2, 0, l->inner_vector_size, l ); + diff = global_norm_double( &vdd4, 0, l->inner_vector_size, l, no_threading ) / + global_norm_double( &vdd3, 0, l->inner_vector_size, l, no_threading ); test0_PRECISION("depth: %d, correctness of doublet Dirac operator PRECISION: %le\n", l->depth, diff ); END_LOCKED_MASTER(threading) if(threading->n_core > 1) { - trans_PRECISION( vpp1, vdd1, op->translation_table, l, threading ); - apply_operator_PRECISION( vpp2, vpp1, &(l->p_PRECISION), l, threading ); - trans_back_PRECISION( vdd3, vpp2, op->translation_table, l, threading ); + trans_PRECISION( &vpp1, &vdd1, op->translation_table, l, threading ); + apply_operator_PRECISION( &vpp2, &vpp1, &(l->p_PRECISION), l, threading ); + trans_back_PRECISION( &vdd3, &vpp2, op->translation_table, l, threading ); SYNC_MASTER_TO_ALL(threading) SYNC_CORES(threading) START_LOCKED_MASTER(threading) - vector_double_minus( vdd4, vdd3, vdd2, 0, l->inner_vector_size, l ); - diff = global_norm_double( vdd4, 0, l->inner_vector_size, l, no_threading ) / - global_norm_double( vdd3, 0, l->inner_vector_size, l, no_threading ); + vector_double_minus( &vdd4, &vdd3, &vdd2, 0, l->inner_vector_size, l ); + diff = global_norm_double( &vdd4, 0, l->inner_vector_size, l, no_threading ) / + global_norm_double( &vdd3, 0, l->inner_vector_size, l, no_threading ); test0_PRECISION("depth: %d, correctness of doublet Dirac operator PRECISION with threading: %le\n", l->depth, diff ); END_LOCKED_MASTER(threading) } - PUBLIC_FREE( vd1, complex_double, 4*ivs + 2*4*ivs ); - PUBLIC_FREE( vpp1, complex_PRECISION, 2*2*ivs ); + PUBLIC_FREE( vd1.vector_buffer, complex_double, 4*ivs + 2*4*ivs ); + PUBLIC_FREE( vpp1.vector_buffer, complex_PRECISION, 2*2*ivs ); START_LOCKED_MASTER(threading) if ( g.method >=4 && g.odd_even ) diff --git a/src/dirac_generic.h b/src/dirac_generic.h index 1224f78..672c718 100644 --- a/src/dirac_generic.h +++ b/src/dirac_generic.h @@ -24,20 +24,20 @@ struct Thread; - void two_flavours_to_serial_PRECISION( vector_PRECISION flav1, vector_PRECISION flav2, vector_PRECISION serial, level_struct *l, struct Thread *threading ); - void serial_to_two_flavours_PRECISION( vector_PRECISION flav1, vector_PRECISION flav2, vector_PRECISION serial, level_struct *l, struct Thread *threading ); + void two_flavours_to_serial_PRECISION( vector_PRECISION *flav1, vector_PRECISION *flav2, vector_PRECISION *serial, level_struct *l, struct Thread *threading ); + void serial_to_two_flavours_PRECISION( vector_PRECISION *flav1, vector_PRECISION *flav2, vector_PRECISION *serial, level_struct *l, struct Thread *threading ); - void clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operator_PRECISION_struct *op, int start, int end, level_struct *l, struct Thread *threading ); + void clover_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, operator_PRECISION_struct *op, int start, int end, level_struct *l, struct Thread *threading ); - void d_plus_clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); - void d_plus_clover_dagger_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operator_PRECISION_struct *op, level_struct *l, 
struct Thread *threading ); - void g5D_plus_clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); - void block_d_plus_clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, int start, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ); - void diagonal_aggregate_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, vector_PRECISION phi, config_PRECISION diag, level_struct *l ); - void d_plus_clover_aggregate_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, vector_PRECISION phi, schwarz_PRECISION_struct *s, level_struct *l ); - void d_neighbor_aggregate_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, vector_PRECISION phi, const int mu, schwarz_PRECISION_struct *s, level_struct *l ); - void apply_twisted_bc_to_vector_PRECISION( vector_PRECISION eta, vector_PRECISION phi, double *theta, level_struct *l); + void d_plus_clover_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); + void d_plus_clover_dagger_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); + void g5D_plus_clover_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); + void block_d_plus_clover_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, int start, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ); + void diagonal_aggregate_PRECISION( vector_PRECISION *eta1, vector_PRECISION *eta2, vector_PRECISION *phi, config_PRECISION diag, level_struct *l ); + void d_plus_clover_aggregate_PRECISION( vector_PRECISION *eta1, vector_PRECISION *eta2, vector_PRECISION *phi, schwarz_PRECISION_struct *s, level_struct *l ); + void d_neighbor_aggregate_PRECISION( vector_PRECISION *eta1, vector_PRECISION *eta2, vector_PRECISION *phi, const int mu, schwarz_PRECISION_struct *s, level_struct *l ); + void apply_twisted_bc_to_vector_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, double *theta, level_struct *l); void operator_updates_PRECISION( level_struct *l, struct Thread *threading ); void m0_update_PRECISION( PRECISION m0,operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); void tm_term_PRECISION_setup( PRECISION mu, PRECISION even, PRECISION odd, operator_PRECISION_struct *op, @@ -46,22 +46,22 @@ level_struct *l, struct Thread *threading ); void two_flavours_test_PRECISION( operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); - void gamma5_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ); - void tau1_gamma5_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ); - void set_even_to_zero_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ); - void gamma5_set_even_to_zero_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ); - void tau1_gamma5_set_even_to_zero_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ); - void set_odd_to_zero_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ); - void gamma5_set_odd_to_zero_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ); - void 
tau1_gamma5_set_odd_to_zero_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ); - void scale_even_odd_PRECISION( vector_PRECISION eta, vector_PRECISION phi, complex_double even, complex_double odd, + void gamma5_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, level_struct *l, struct Thread *threading ); + void tau1_gamma5_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, level_struct *l, struct Thread *threading ); + void set_even_to_zero_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, level_struct *l, struct Thread *threading ); + void gamma5_set_even_to_zero_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, level_struct *l, struct Thread *threading ); + void tau1_gamma5_set_even_to_zero_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, level_struct *l, struct Thread *threading ); + void set_odd_to_zero_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, level_struct *l, struct Thread *threading ); + void gamma5_set_odd_to_zero_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, level_struct *l, struct Thread *threading ); + void tau1_gamma5_set_odd_to_zero_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, level_struct *l, struct Thread *threading ); + void scale_even_odd_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, complex_double even, complex_double odd, level_struct *l, struct Thread *threading ); - static inline void add_diagonal_PRECISION( const vector_PRECISION eta, const vector_PRECISION phi, + static inline void add_diagonal_PRECISION( const vector_PRECISION *eta, const vector_PRECISION *phi, const config_PRECISION diag, const int length ) { config_PRECISION diag_pt = diag; - vector_PRECISION phi_pt = phi, eta_pt = eta, eta_end = eta + length; + buffer_PRECISION phi_pt = phi->vector_buffer, eta_pt = eta->vector_buffer, eta_end = eta->vector_buffer + length; #ifdef HAVE_TM1p1 if(g.n_flavours == 2) while ( eta_pt < eta_end ) { @@ -79,10 +79,10 @@ } #ifdef HAVE_TM1p1 - static inline void apply_doublet_coupling_PRECISION( const vector_PRECISION eta, const vector_PRECISION phi, + static inline void apply_doublet_coupling_PRECISION( const vector_PRECISION *eta, const vector_PRECISION *phi, const config_PRECISION diag, const int length ) { config_PRECISION diag_pt = diag; - vector_PRECISION phi_pt = phi, eta_pt = eta, eta_end = eta + length; + buffer_PRECISION phi_pt = phi->vector_buffer, eta_pt = eta->vector_buffer, eta_end = eta->vector_buffer + length; while ( eta_pt < eta_end ) { phi_pt += 6; FOR6( *eta_pt += (*phi_pt)*(*diag_pt); eta_pt++; phi_pt++; diag_pt++; ) @@ -95,7 +95,7 @@ #endif // eta = D*phi - static inline void mvm_PRECISION( const vector_PRECISION eta, const complex_PRECISION *D, const vector_PRECISION phi ) { + static inline void mvm_PRECISION( const buffer_PRECISION eta, const complex_PRECISION *D, const buffer_PRECISION phi ) { eta[0] = D[0]*phi[0]; eta[0] += D[1]*phi[1]; eta[0] += D[2]*phi[2]; @@ -108,7 +108,7 @@ } // eta = D**H*phi - static inline void mvmh_PRECISION( const vector_PRECISION eta, const complex_PRECISION *D, const vector_PRECISION phi ) { + static inline void mvmh_PRECISION( const buffer_PRECISION eta, const complex_PRECISION *D, const buffer_PRECISION phi ) { eta[0] = conj_PRECISION(D[0])*phi[0]; eta[1] = conj_PRECISION(D[1])*phi[0]; eta[2] = conj_PRECISION(D[2])*phi[0]; @@ -121,7 +121,7 @@ } // eta = -D*phi - static inline void nmvm_PRECISION( const vector_PRECISION eta, const complex_PRECISION *D, const vector_PRECISION phi ) { + 
static inline void nmvm_PRECISION( const buffer_PRECISION eta, const complex_PRECISION *D, const buffer_PRECISION phi ) { eta[0] = - D[0]*phi[0]; eta[0] -= D[1]*phi[1]; eta[0] -= D[2]*phi[2]; @@ -134,7 +134,7 @@ } // eta = -D**H*phi - static inline void nmvmh_PRECISION( const vector_PRECISION eta, const complex_PRECISION *D, const vector_PRECISION phi ) { + static inline void nmvmh_PRECISION( const buffer_PRECISION eta, const complex_PRECISION *D, const buffer_PRECISION phi ) { eta[0] = - conj_PRECISION(D[0])*phi[0]; eta[1] = - conj_PRECISION(D[1])*phi[0]; eta[2] = - conj_PRECISION(D[2])*phi[0]; @@ -147,7 +147,7 @@ } // 1 - gamma_T - static inline void prp_T_PRECISION( const vector_PRECISION prp_pt, const vector_PRECISION l_pt ) { + static inline void prp_T_PRECISION( const buffer_PRECISION prp_pt, const buffer_PRECISION l_pt ) { prp_pt[0] = l_pt[0] -GAMMA_T_SPIN0_VAL*l_pt[3*GAMMA_T_SPIN0_CO]; prp_pt[1] = l_pt[1] -GAMMA_T_SPIN0_VAL*l_pt[3*GAMMA_T_SPIN0_CO+1]; prp_pt[2] = l_pt[2] -GAMMA_T_SPIN0_VAL*l_pt[3*GAMMA_T_SPIN0_CO+2]; @@ -157,7 +157,7 @@ } // 1 + gamma_T - static inline void prn_T_PRECISION( const vector_PRECISION prn_pt, const vector_PRECISION l_pt ) { + static inline void prn_T_PRECISION( const buffer_PRECISION prn_pt, const buffer_PRECISION l_pt ) { prn_pt[0] = l_pt[0] +GAMMA_T_SPIN0_VAL*l_pt[3*GAMMA_T_SPIN0_CO]; prn_pt[1] = l_pt[1] +GAMMA_T_SPIN0_VAL*l_pt[3*GAMMA_T_SPIN0_CO+1]; prn_pt[2] = l_pt[2] +GAMMA_T_SPIN0_VAL*l_pt[3*GAMMA_T_SPIN0_CO+2]; @@ -167,7 +167,7 @@ } // - (1 - gamma_T) - static inline void pbp_su3_T_PRECISION( const vector_PRECISION prp_su3_pt, const vector_PRECISION l_pt ) { + static inline void pbp_su3_T_PRECISION( const buffer_PRECISION prp_su3_pt, const buffer_PRECISION l_pt ) { l_pt[ 0] -= prp_su3_pt[0]; l_pt[ 1] -= prp_su3_pt[1]; l_pt[ 2] -= prp_su3_pt[2]; @@ -183,7 +183,7 @@ } // -(1 + gamma_T) - static inline void pbn_su3_T_PRECISION( const vector_PRECISION prn_su3_pt, const vector_PRECISION l_pt ) { + static inline void pbn_su3_T_PRECISION( const buffer_PRECISION prn_su3_pt, const buffer_PRECISION l_pt ) { l_pt[ 0] -= prn_su3_pt[0]; l_pt[ 1] -= prn_su3_pt[1]; l_pt[ 2] -= prn_su3_pt[2]; @@ -198,7 +198,7 @@ l_pt[11] -= GAMMA_T_SPIN3_VAL*prn_su3_pt[3*GAMMA_T_SPIN3_CO+2]; } - static inline void prp_Z_PRECISION( const vector_PRECISION prp_pt, const vector_PRECISION l_pt ) { + static inline void prp_Z_PRECISION( const buffer_PRECISION prp_pt, const buffer_PRECISION l_pt ) { prp_pt[0] = l_pt[0] -GAMMA_Z_SPIN0_VAL*l_pt[3*GAMMA_Z_SPIN0_CO]; prp_pt[1] = l_pt[1] -GAMMA_Z_SPIN0_VAL*l_pt[3*GAMMA_Z_SPIN0_CO+1]; prp_pt[2] = l_pt[2] -GAMMA_Z_SPIN0_VAL*l_pt[3*GAMMA_Z_SPIN0_CO+2]; @@ -207,7 +207,7 @@ prp_pt[5] = l_pt[5] -GAMMA_Z_SPIN1_VAL*l_pt[3*GAMMA_Z_SPIN1_CO+2]; } - static inline void prn_Z_PRECISION( const vector_PRECISION prn_pt, const vector_PRECISION l_pt ) { + static inline void prn_Z_PRECISION( const buffer_PRECISION prn_pt, const buffer_PRECISION l_pt ) { prn_pt[0] = l_pt[0] +GAMMA_Z_SPIN0_VAL*l_pt[3*GAMMA_Z_SPIN0_CO]; prn_pt[1] = l_pt[1] +GAMMA_Z_SPIN0_VAL*l_pt[3*GAMMA_Z_SPIN0_CO+1]; prn_pt[2] = l_pt[2] +GAMMA_Z_SPIN0_VAL*l_pt[3*GAMMA_Z_SPIN0_CO+2]; @@ -216,7 +216,7 @@ prn_pt[5] = l_pt[5] +GAMMA_Z_SPIN1_VAL*l_pt[3*GAMMA_Z_SPIN1_CO+2]; } - static inline void pbp_su3_Z_PRECISION( const vector_PRECISION prp_su3_pt, const vector_PRECISION l_pt ) { + static inline void pbp_su3_Z_PRECISION( const buffer_PRECISION prp_su3_pt, const buffer_PRECISION l_pt ) { l_pt[ 0] -= prp_su3_pt[0]; l_pt[ 1] -= prp_su3_pt[1]; l_pt[ 2] -= prp_su3_pt[2]; @@ -231,7 +231,7 @@ l_pt[11] += 
GAMMA_Z_SPIN3_VAL*prp_su3_pt[3*GAMMA_Z_SPIN3_CO+2]; } - static inline void pbn_su3_Z_PRECISION( const vector_PRECISION prn_su3_pt, const vector_PRECISION l_pt ) { + static inline void pbn_su3_Z_PRECISION( const buffer_PRECISION prn_su3_pt, const buffer_PRECISION l_pt ) { l_pt[ 0] -= prn_su3_pt[0]; l_pt[ 1] -= prn_su3_pt[1]; l_pt[ 2] -= prn_su3_pt[2]; @@ -246,7 +246,7 @@ l_pt[11] -= GAMMA_Z_SPIN3_VAL*prn_su3_pt[3*GAMMA_Z_SPIN3_CO+2]; } - static inline void prp_Y_PRECISION( const vector_PRECISION prp_pt, const vector_PRECISION l_pt ) { + static inline void prp_Y_PRECISION( const buffer_PRECISION prp_pt, const buffer_PRECISION l_pt ) { prp_pt[0] = l_pt[0] -GAMMA_Y_SPIN0_VAL*l_pt[3*GAMMA_Y_SPIN0_CO]; prp_pt[1] = l_pt[1] -GAMMA_Y_SPIN0_VAL*l_pt[3*GAMMA_Y_SPIN0_CO+1]; prp_pt[2] = l_pt[2] -GAMMA_Y_SPIN0_VAL*l_pt[3*GAMMA_Y_SPIN0_CO+2]; @@ -255,7 +255,7 @@ prp_pt[5] = l_pt[5] -GAMMA_Y_SPIN1_VAL*l_pt[3*GAMMA_Y_SPIN1_CO+2]; } - static inline void prn_Y_PRECISION( const vector_PRECISION prn_pt, const vector_PRECISION l_pt ) { + static inline void prn_Y_PRECISION( const buffer_PRECISION prn_pt, const buffer_PRECISION l_pt ) { prn_pt[0] = l_pt[0] +GAMMA_Y_SPIN0_VAL*l_pt[3*GAMMA_Y_SPIN0_CO]; prn_pt[1] = l_pt[1] +GAMMA_Y_SPIN0_VAL*l_pt[3*GAMMA_Y_SPIN0_CO+1]; prn_pt[2] = l_pt[2] +GAMMA_Y_SPIN0_VAL*l_pt[3*GAMMA_Y_SPIN0_CO+2]; @@ -264,7 +264,7 @@ prn_pt[5] = l_pt[5] +GAMMA_Y_SPIN1_VAL*l_pt[3*GAMMA_Y_SPIN1_CO+2]; } - static inline void pbp_su3_Y_PRECISION( const vector_PRECISION prp_su3_pt, const vector_PRECISION l_pt ) { + static inline void pbp_su3_Y_PRECISION( const buffer_PRECISION prp_su3_pt, const buffer_PRECISION l_pt ) { l_pt[ 0] -= prp_su3_pt[0]; l_pt[ 1] -= prp_su3_pt[1]; l_pt[ 2] -= prp_su3_pt[2]; @@ -279,7 +279,7 @@ l_pt[11] += GAMMA_Y_SPIN3_VAL*prp_su3_pt[3*GAMMA_Y_SPIN3_CO+2]; } - static inline void pbn_su3_Y_PRECISION( const vector_PRECISION prn_su3_pt, const vector_PRECISION l_pt ) { + static inline void pbn_su3_Y_PRECISION( const buffer_PRECISION prn_su3_pt, const buffer_PRECISION l_pt ) { l_pt[ 0] -= prn_su3_pt[0]; l_pt[ 1] -= prn_su3_pt[1]; l_pt[ 2] -= prn_su3_pt[2]; @@ -294,7 +294,7 @@ l_pt[11] -= GAMMA_Y_SPIN3_VAL*prn_su3_pt[3*GAMMA_Y_SPIN3_CO+2]; } - static inline void prp_X_PRECISION( const vector_PRECISION prp_pt, const vector_PRECISION l_pt ) { + static inline void prp_X_PRECISION( const buffer_PRECISION prp_pt, const buffer_PRECISION l_pt ) { prp_pt[0] = l_pt[0] -GAMMA_X_SPIN0_VAL*l_pt[3*GAMMA_X_SPIN0_CO]; prp_pt[1] = l_pt[1] -GAMMA_X_SPIN0_VAL*l_pt[3*GAMMA_X_SPIN0_CO+1]; prp_pt[2] = l_pt[2] -GAMMA_X_SPIN0_VAL*l_pt[3*GAMMA_X_SPIN0_CO+2]; @@ -303,7 +303,7 @@ prp_pt[5] = l_pt[5] -GAMMA_X_SPIN1_VAL*l_pt[3*GAMMA_X_SPIN1_CO+2]; } - static inline void prn_X_PRECISION( const vector_PRECISION prn_pt, const vector_PRECISION l_pt ) { + static inline void prn_X_PRECISION( const buffer_PRECISION prn_pt, const buffer_PRECISION l_pt ) { prn_pt[0] = l_pt[0] +GAMMA_X_SPIN0_VAL*l_pt[3*GAMMA_X_SPIN0_CO]; prn_pt[1] = l_pt[1] +GAMMA_X_SPIN0_VAL*l_pt[3*GAMMA_X_SPIN0_CO+1]; prn_pt[2] = l_pt[2] +GAMMA_X_SPIN0_VAL*l_pt[3*GAMMA_X_SPIN0_CO+2]; @@ -312,7 +312,7 @@ prn_pt[5] = l_pt[5] +GAMMA_X_SPIN1_VAL*l_pt[3*GAMMA_X_SPIN1_CO+2]; } - static inline void pbp_su3_X_PRECISION( const vector_PRECISION prp_su3_pt, const vector_PRECISION l_pt ) { + static inline void pbp_su3_X_PRECISION( const buffer_PRECISION prp_su3_pt, const buffer_PRECISION l_pt ) { l_pt[ 0] -= prp_su3_pt[0]; l_pt[ 1] -= prp_su3_pt[1]; l_pt[ 2] -= prp_su3_pt[2]; @@ -327,7 +327,7 @@ l_pt[11] += GAMMA_X_SPIN3_VAL*prp_su3_pt[3*GAMMA_X_SPIN3_CO+2]; } 
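/* The signature rewrites in this header all follow one rule: whole-vector interfaces move from the
   bare pointer type to the new struct, while the site-local kernels around this point (projectors,
   mvm helpers) keep operating on raw complex arrays. A minimal sketch of the types behind this,
   assuming the struct introduced in src/vector_generic.h by this patch carries only the raw buffer
   at this WIP stage (further members are not shown in these hunks and may differ):

     typedef complex_PRECISION *buffer_PRECISION;   // the old meaning of vector_PRECISION

     typedef struct {
       buffer_PRECISION vector_buffer;              // raw storage; may alias another vector
     } vector_PRECISION;

     void vector_PRECISION_init( vector_PRECISION *v ) {
       v->vector_buffer = NULL;                     // replaces the old "ptr = NULL" idiom
     }
*/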
- static inline void pbn_su3_X_PRECISION( const vector_PRECISION prn_su3_pt, const vector_PRECISION l_pt ) { + static inline void pbn_su3_X_PRECISION( const buffer_PRECISION prn_su3_pt, const buffer_PRECISION l_pt ) { l_pt[ 0] -= prn_su3_pt[0]; l_pt[ 1] -= prn_su3_pt[1]; l_pt[ 2] -= prn_su3_pt[2]; @@ -349,7 +349,7 @@ #define flav_gamma(k) (3*(k)+6*((k)/2)) // 1 - gamma_T - static inline void dprp_T_PRECISION( const vector_PRECISION prp_pt, const vector_PRECISION l_pt ) { + static inline void dprp_T_PRECISION( const buffer_PRECISION prp_pt, const buffer_PRECISION l_pt ) { prp_pt[ 0] = l_pt[ 0] -GAMMA_T_SPIN0_VAL*l_pt[flav_gamma(GAMMA_T_SPIN0_CO)]; prp_pt[ 1] = l_pt[ 1] -GAMMA_T_SPIN0_VAL*l_pt[flav_gamma(GAMMA_T_SPIN0_CO)+1]; prp_pt[ 2] = l_pt[ 2] -GAMMA_T_SPIN0_VAL*l_pt[flav_gamma(GAMMA_T_SPIN0_CO)+2]; @@ -365,7 +365,7 @@ } // 1 + gamma_T - static inline void dprn_T_PRECISION( const vector_PRECISION prn_pt, const vector_PRECISION l_pt ) { + static inline void dprn_T_PRECISION( const buffer_PRECISION prn_pt, const buffer_PRECISION l_pt ) { prn_pt[ 0] = l_pt[ 0] +GAMMA_T_SPIN0_VAL*l_pt[flav_gamma(GAMMA_T_SPIN0_CO)]; prn_pt[ 1] = l_pt[ 1] +GAMMA_T_SPIN0_VAL*l_pt[flav_gamma(GAMMA_T_SPIN0_CO)+1]; prn_pt[ 2] = l_pt[ 2] +GAMMA_T_SPIN0_VAL*l_pt[flav_gamma(GAMMA_T_SPIN0_CO)+2]; @@ -381,7 +381,7 @@ } // - (1 - gamma_T) - static inline void dpbp_su3_T_PRECISION( const vector_PRECISION prp_su3_pt, const vector_PRECISION l_pt ) { + static inline void dpbp_su3_T_PRECISION( const buffer_PRECISION prp_su3_pt, const buffer_PRECISION l_pt ) { l_pt[ 0] -= prp_su3_pt[ 0]; l_pt[ 1] -= prp_su3_pt[ 1]; l_pt[ 2] -= prp_su3_pt[ 2]; @@ -409,7 +409,7 @@ } // -(1 + gamma_T) - static inline void dpbn_su3_T_PRECISION( const vector_PRECISION prn_su3_pt, const vector_PRECISION l_pt ) { + static inline void dpbn_su3_T_PRECISION( const buffer_PRECISION prn_su3_pt, const buffer_PRECISION l_pt ) { l_pt[ 0] -= prn_su3_pt[ 0]; l_pt[ 1] -= prn_su3_pt[ 1]; l_pt[ 2] -= prn_su3_pt[ 2]; @@ -438,7 +438,7 @@ // 1 - gamma_Z - static inline void dprp_Z_PRECISION( const vector_PRECISION prp_pt, const vector_PRECISION l_pt ) { + static inline void dprp_Z_PRECISION( const buffer_PRECISION prp_pt, const buffer_PRECISION l_pt ) { prp_pt[ 0] = l_pt[ 0] -GAMMA_Z_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Z_SPIN0_CO)]; prp_pt[ 1] = l_pt[ 1] -GAMMA_Z_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Z_SPIN0_CO)+1]; prp_pt[ 2] = l_pt[ 2] -GAMMA_Z_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Z_SPIN0_CO)+2]; @@ -454,7 +454,7 @@ } // 1 + gamma_Z - static inline void dprn_Z_PRECISION( const vector_PRECISION prn_pt, const vector_PRECISION l_pt ) { + static inline void dprn_Z_PRECISION( const buffer_PRECISION prn_pt, const buffer_PRECISION l_pt ) { prn_pt[ 0] = l_pt[ 0] +GAMMA_Z_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Z_SPIN0_CO)]; prn_pt[ 1] = l_pt[ 1] +GAMMA_Z_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Z_SPIN0_CO)+1]; prn_pt[ 2] = l_pt[ 2] +GAMMA_Z_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Z_SPIN0_CO)+2]; @@ -470,7 +470,7 @@ } // - (1 - gamma_Z) - static inline void dpbp_su3_Z_PRECISION( const vector_PRECISION prp_su3_pt, const vector_PRECISION l_pt ) { + static inline void dpbp_su3_Z_PRECISION( const buffer_PRECISION prp_su3_pt, const buffer_PRECISION l_pt ) { l_pt[ 0] -= prp_su3_pt[ 0]; l_pt[ 1] -= prp_su3_pt[ 1]; l_pt[ 2] -= prp_su3_pt[ 2]; @@ -498,7 +498,7 @@ } // -(1 + gamma_Z) - static inline void dpbn_su3_Z_PRECISION( const vector_PRECISION prn_su3_pt, const vector_PRECISION l_pt ) { + static inline void dpbn_su3_Z_PRECISION( const buffer_PRECISION prn_su3_pt, const buffer_PRECISION l_pt ) { l_pt[ 0] -= 
prn_su3_pt[ 0]; l_pt[ 1] -= prn_su3_pt[ 1]; l_pt[ 2] -= prn_su3_pt[ 2]; @@ -527,7 +527,7 @@ // 1 - gamma_Y - static inline void dprp_Y_PRECISION( const vector_PRECISION prp_pt, const vector_PRECISION l_pt ) { + static inline void dprp_Y_PRECISION( const buffer_PRECISION prp_pt, const buffer_PRECISION l_pt ) { prp_pt[ 0] = l_pt[ 0] -GAMMA_Y_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Y_SPIN0_CO)]; prp_pt[ 1] = l_pt[ 1] -GAMMA_Y_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Y_SPIN0_CO)+1]; prp_pt[ 2] = l_pt[ 2] -GAMMA_Y_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Y_SPIN0_CO)+2]; @@ -543,7 +543,7 @@ } // 1 + gamma_Y - static inline void dprn_Y_PRECISION( const vector_PRECISION prn_pt, const vector_PRECISION l_pt ) { + static inline void dprn_Y_PRECISION( const buffer_PRECISION prn_pt, const buffer_PRECISION l_pt ) { prn_pt[ 0] = l_pt[ 0] +GAMMA_Y_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Y_SPIN0_CO)]; prn_pt[ 1] = l_pt[ 1] +GAMMA_Y_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Y_SPIN0_CO)+1]; prn_pt[ 2] = l_pt[ 2] +GAMMA_Y_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Y_SPIN0_CO)+2]; @@ -559,7 +559,7 @@ } // - (1 - gamma_Y) - static inline void dpbp_su3_Y_PRECISION( const vector_PRECISION prp_su3_pt, const vector_PRECISION l_pt ) { + static inline void dpbp_su3_Y_PRECISION( const buffer_PRECISION prp_su3_pt, const buffer_PRECISION l_pt ) { l_pt[ 0] -= prp_su3_pt[ 0]; l_pt[ 1] -= prp_su3_pt[ 1]; l_pt[ 2] -= prp_su3_pt[ 2]; @@ -587,7 +587,7 @@ } // -(1 + gamma_Y) - static inline void dpbn_su3_Y_PRECISION( const vector_PRECISION prn_su3_pt, const vector_PRECISION l_pt ) { + static inline void dpbn_su3_Y_PRECISION( const buffer_PRECISION prn_su3_pt, const buffer_PRECISION l_pt ) { l_pt[ 0] -= prn_su3_pt[ 0]; l_pt[ 1] -= prn_su3_pt[ 1]; l_pt[ 2] -= prn_su3_pt[ 2]; @@ -616,7 +616,7 @@ // 1 - gamma_X - static inline void dprp_X_PRECISION( const vector_PRECISION prp_pt, const vector_PRECISION l_pt ) { + static inline void dprp_X_PRECISION( const buffer_PRECISION prp_pt, const buffer_PRECISION l_pt ) { prp_pt[ 0] = l_pt[ 0] -GAMMA_X_SPIN0_VAL*l_pt[flav_gamma(GAMMA_X_SPIN0_CO)]; prp_pt[ 1] = l_pt[ 1] -GAMMA_X_SPIN0_VAL*l_pt[flav_gamma(GAMMA_X_SPIN0_CO)+1]; prp_pt[ 2] = l_pt[ 2] -GAMMA_X_SPIN0_VAL*l_pt[flav_gamma(GAMMA_X_SPIN0_CO)+2]; @@ -632,7 +632,7 @@ } // 1 + gamma_X - static inline void dprn_X_PRECISION( const vector_PRECISION prn_pt, const vector_PRECISION l_pt ) { + static inline void dprn_X_PRECISION( const buffer_PRECISION prn_pt, const buffer_PRECISION l_pt ) { prn_pt[ 0] = l_pt[ 0] +GAMMA_X_SPIN0_VAL*l_pt[flav_gamma(GAMMA_X_SPIN0_CO)]; prn_pt[ 1] = l_pt[ 1] +GAMMA_X_SPIN0_VAL*l_pt[flav_gamma(GAMMA_X_SPIN0_CO)+1]; prn_pt[ 2] = l_pt[ 2] +GAMMA_X_SPIN0_VAL*l_pt[flav_gamma(GAMMA_X_SPIN0_CO)+2]; @@ -648,7 +648,7 @@ } // - (1 - gamma_X) - static inline void dpbp_su3_X_PRECISION( const vector_PRECISION prp_su3_pt, const vector_PRECISION l_pt ) { + static inline void dpbp_su3_X_PRECISION( const buffer_PRECISION prp_su3_pt, const buffer_PRECISION l_pt ) { l_pt[ 0] -= prp_su3_pt[ 0]; l_pt[ 1] -= prp_su3_pt[ 1]; l_pt[ 2] -= prp_su3_pt[ 2]; @@ -676,7 +676,7 @@ } // -(1 + gamma_X) - static inline void dpbn_su3_X_PRECISION( const vector_PRECISION prn_su3_pt, const vector_PRECISION l_pt ) { + static inline void dpbn_su3_X_PRECISION( const buffer_PRECISION prn_su3_pt, const buffer_PRECISION l_pt ) { l_pt[ 0] -= prn_su3_pt[ 0]; l_pt[ 1] -= prn_su3_pt[ 1]; l_pt[ 2] -= prn_su3_pt[ 2]; @@ -706,7 +706,7 @@ #endif //END - static inline void twospin_p_T_PRECISION( const vector_PRECISION out_spin0and1, const vector_PRECISION out_spin2and3, const vector_PRECISION in ) { + static inline void 
twospin_p_T_PRECISION( const buffer_PRECISION out_spin0and1, const buffer_PRECISION out_spin2and3, const buffer_PRECISION in ) { out_spin0and1[ 0] -= in[ 0]; out_spin0and1[ 1] -= in[ 1]; out_spin0and1[ 2] -= in[ 2]; @@ -733,7 +733,7 @@ out_spin2and3[11] -= in[11]; } - static inline void twospin2_p_T_PRECISION( const vector_PRECISION out_spin0and1, const vector_PRECISION out_spin2and3, const vector_PRECISION in ) { + static inline void twospin2_p_T_PRECISION( const buffer_PRECISION out_spin0and1, const buffer_PRECISION out_spin2and3, const buffer_PRECISION in ) { out_spin0and1[ 0] = in[ 0]; out_spin0and1[ 1] = in[ 1]; out_spin0and1[ 2] = in[ 2]; @@ -760,7 +760,7 @@ out_spin2and3[11] = in[11]; } - static inline void twospin_n_T_PRECISION( const vector_PRECISION out_spin0and1, const vector_PRECISION out_spin2and3, const vector_PRECISION in ) { + static inline void twospin_n_T_PRECISION( const buffer_PRECISION out_spin0and1, const buffer_PRECISION out_spin2and3, const buffer_PRECISION in ) { out_spin0and1[ 0] -= in[ 0]; out_spin0and1[ 1] -= in[ 1]; out_spin0and1[ 2] -= in[ 2]; @@ -787,7 +787,7 @@ out_spin2and3[11] -= in[11]; } - static inline void twospin_p_Z_PRECISION( const vector_PRECISION out_spin0and1, const vector_PRECISION out_spin2and3, const vector_PRECISION in ) { + static inline void twospin_p_Z_PRECISION( const buffer_PRECISION out_spin0and1, const buffer_PRECISION out_spin2and3, const buffer_PRECISION in ) { out_spin0and1[ 0] -= in[ 0]; out_spin0and1[ 1] -= in[ 1]; out_spin0and1[ 2] -= in[ 2]; @@ -814,7 +814,7 @@ out_spin2and3[11] -= in[11]; } - static inline void twospin2_p_Z_PRECISION( const vector_PRECISION out_spin0and1, const vector_PRECISION out_spin2and3, const vector_PRECISION in ) { + static inline void twospin2_p_Z_PRECISION( const buffer_PRECISION out_spin0and1, const buffer_PRECISION out_spin2and3, const buffer_PRECISION in ) { out_spin0and1[ 0] = in[ 0]; out_spin0and1[ 1] = in[ 1]; out_spin0and1[ 2] = in[ 2]; @@ -841,7 +841,7 @@ out_spin2and3[11] = in[11]; } - static inline void twospin_n_Z_PRECISION( const vector_PRECISION out_spin0and1, const vector_PRECISION out_spin2and3, const vector_PRECISION in ) { + static inline void twospin_n_Z_PRECISION( const buffer_PRECISION out_spin0and1, const buffer_PRECISION out_spin2and3, const buffer_PRECISION in ) { out_spin0and1[ 0] -= in[ 0]; out_spin0and1[ 1] -= in[ 1]; out_spin0and1[ 2] -= in[ 2]; @@ -868,7 +868,7 @@ out_spin2and3[11] -= in[11]; } - static inline void twospin_p_Y_PRECISION( const vector_PRECISION out_spin0and1, const vector_PRECISION out_spin2and3, const vector_PRECISION in ) { + static inline void twospin_p_Y_PRECISION( const buffer_PRECISION out_spin0and1, const buffer_PRECISION out_spin2and3, const buffer_PRECISION in ) { out_spin0and1[ 0] -= in[ 0]; out_spin0and1[ 1] -= in[ 1]; out_spin0and1[ 2] -= in[ 2]; @@ -895,7 +895,7 @@ out_spin2and3[11] -= in[11]; } - static inline void twospin2_p_Y_PRECISION( const vector_PRECISION out_spin0and1, const vector_PRECISION out_spin2and3, const vector_PRECISION in ) { + static inline void twospin2_p_Y_PRECISION( const buffer_PRECISION out_spin0and1, const buffer_PRECISION out_spin2and3, const buffer_PRECISION in ) { out_spin0and1[ 0] = in[ 0]; out_spin0and1[ 1] = in[ 1]; out_spin0and1[ 2] = in[ 2]; @@ -922,7 +922,7 @@ out_spin2and3[11] = in[11]; } - static inline void twospin_n_Y_PRECISION( const vector_PRECISION out_spin0and1, const vector_PRECISION out_spin2and3, const vector_PRECISION in ) { + static inline void twospin_n_Y_PRECISION( const buffer_PRECISION 
out_spin0and1, const buffer_PRECISION out_spin2and3, const buffer_PRECISION in ) { out_spin0and1[ 0] -= in[ 0]; out_spin0and1[ 1] -= in[ 1]; out_spin0and1[ 2] -= in[ 2]; @@ -949,7 +949,7 @@ out_spin2and3[11] -= in[11]; } - static inline void twospin_p_X_PRECISION( const vector_PRECISION out_spin0and1, const vector_PRECISION out_spin2and3, const vector_PRECISION in ) { + static inline void twospin_p_X_PRECISION( const buffer_PRECISION out_spin0and1, const buffer_PRECISION out_spin2and3, const buffer_PRECISION in ) { out_spin0and1[ 0] -= in[ 0]; out_spin0and1[ 1] -= in[ 1]; out_spin0and1[ 2] -= in[ 2]; @@ -976,7 +976,7 @@ out_spin2and3[11] -= in[11]; } - static inline void twospin2_p_X_PRECISION( const vector_PRECISION out_spin0and1, const vector_PRECISION out_spin2and3, const vector_PRECISION in ) { + static inline void twospin2_p_X_PRECISION( const buffer_PRECISION out_spin0and1, const buffer_PRECISION out_spin2and3, const buffer_PRECISION in ) { out_spin0and1[ 0] = in[ 0]; out_spin0and1[ 1] = in[ 1]; out_spin0and1[ 2] = in[ 2]; @@ -1003,7 +1003,7 @@ out_spin2and3[11] = in[11]; } - static inline void twospin_n_X_PRECISION( const vector_PRECISION out_spin0and1, const vector_PRECISION out_spin2and3, const vector_PRECISION in ) { + static inline void twospin_n_X_PRECISION( const buffer_PRECISION out_spin0and1, const buffer_PRECISION out_spin2and3, const buffer_PRECISION in ) { out_spin0and1[ 0] -= in[ 0]; out_spin0and1[ 1] -= in[ 1]; out_spin0and1[ 2] -= in[ 2]; @@ -1030,7 +1030,7 @@ out_spin2and3[11] -= in[11]; } - static inline void doublet_site_clover_PRECISION( const vector_PRECISION eta, const vector_PRECISION phi, const config_PRECISION clover ) { + static inline void doublet_site_clover_PRECISION( const buffer_PRECISION eta, const buffer_PRECISION phi, const config_PRECISION clover ) { // diagonal eta[ 0] = clover[ 0]*phi[ 0]; eta[ 1] = clover[ 1]*phi[ 1]; @@ -1182,7 +1182,7 @@ eta[23] += conj_PRECISION(clover[41])*phi[22]; } - static inline void spin0and1_site_clover_PRECISION( const vector_PRECISION eta, const vector_PRECISION phi, const config_PRECISION clover ) { + static inline void spin0and1_site_clover_PRECISION( const buffer_PRECISION eta, const buffer_PRECISION phi, const config_PRECISION clover ) { // diagonal eta[ 0] = clover[ 0]*phi[ 0]; eta[ 1] = clover[ 1]*phi[ 1]; @@ -1229,7 +1229,7 @@ eta[5] += conj_PRECISION(clover[26])*phi[4]; } - static inline void spin2and3_site_clover_PRECISION( const vector_PRECISION eta, const vector_PRECISION phi, const config_PRECISION clover ) { + static inline void spin2and3_site_clover_PRECISION( const buffer_PRECISION eta, const buffer_PRECISION phi, const config_PRECISION clover ) { // diagonal eta[ 0] = _COMPLEX_PRECISION_ZERO; eta[ 1] = _COMPLEX_PRECISION_ZERO; @@ -1276,7 +1276,7 @@ eta[11] += conj_PRECISION(clover[41])*phi[10]; } - static inline void site_clover_PRECISION( const vector_PRECISION eta, const vector_PRECISION phi, const config_PRECISION clover ) { + static inline void site_clover_PRECISION( const buffer_PRECISION eta, const buffer_PRECISION phi, const config_PRECISION clover ) { // diagonal eta[ 0] = clover[ 0]*phi[ 0]; eta[ 1] = clover[ 1]*phi[ 1]; diff --git a/src/gathering_generic.c b/src/gathering_generic.c index 2eb10fc..d4952fb 100644 --- a/src/gathering_generic.c +++ b/src/gathering_generic.c @@ -28,8 +28,8 @@ void gathering_PRECISION_next_level_init( gathering_PRECISION_struct *gs, level_ gs->permutation = NULL; gs->gather_list = NULL; gs->reqs = NULL; - gs->buffer = NULL; - gs->transfer_buffer = NULL; + 
vector_PRECISION_init(&(gs->buffer)); + vector_PRECISION_init(&(gs->transfer_buffer)); gs->dist_inner_lattice_sites = 1; gs->gather_list_length = 1; @@ -49,9 +49,9 @@ void gathering_PRECISION_setup( gathering_PRECISION_struct *gs, level_struct *l process_coords[4] = {0,0,0,0}, parent_coords[4] = {0,0,0,0}, *process_list = NULL; MALLOC( process_list, int, l->num_processes ); #ifdef HAVE_TM1p1 - MALLOC( gs->transfer_buffer, complex_PRECISION, 2 * gs->dist_inner_lattice_sites * l->num_lattice_site_var ); + MALLOC( gs->transfer_buffer.vector_buffer, complex_PRECISION, 2 * gs->dist_inner_lattice_sites * l->num_lattice_site_var ); #else - MALLOC( gs->transfer_buffer, complex_PRECISION, gs->dist_inner_lattice_sites * l->num_lattice_site_var ); + MALLOC( gs->transfer_buffer.vector_buffer, complex_PRECISION, gs->dist_inner_lattice_sites * l->num_lattice_site_var ); #endif l->idle = 0; @@ -96,9 +96,9 @@ void gathering_PRECISION_setup( gathering_PRECISION_struct *gs, level_struct *l MALLOC( gs->permutation, int, l->num_inner_lattice_sites ); MALLOC( gs->reqs, MPI_Request, gs->gather_list_length ); #ifdef HAVE_TM1p1 - MALLOC( gs->buffer, complex_PRECISION, 2*l->inner_vector_size ); + MALLOC( gs->buffer.vector_buffer, complex_PRECISION, 2*l->inner_vector_size ); #else - MALLOC( gs->buffer, complex_PRECISION, l->inner_vector_size ); + MALLOC( gs->buffer.vector_buffer, complex_PRECISION, l->inner_vector_size ); #endif MALLOC( field1, int, l->num_inner_lattice_sites ); MALLOC( field2, int, l->num_inner_lattice_sites ); @@ -213,18 +213,18 @@ void gathering_PRECISION_free( gathering_PRECISION_struct *gs, level_struct *l ) FREE( gs->permutation, int, l->num_inner_lattice_sites ); FREE( gs->reqs, MPI_Request, gs->gather_list_length ); #ifdef HAVE_TM1p1 - FREE( gs->buffer, complex_PRECISION, 2*l->inner_vector_size ); + FREE( gs->buffer.vector_buffer, complex_PRECISION, 2*l->inner_vector_size ); #else - FREE( gs->buffer, complex_PRECISION, l->inner_vector_size ); + FREE( gs->buffer.vector_buffer, complex_PRECISION, l->inner_vector_size ); #endif } MPI_Comm_free( &(gs->level_comm) ); MPI_Group_free( &(gs->level_comm_group) ); #ifdef HAVE_TM1p1 - FREE( gs->transfer_buffer, complex_PRECISION, 2 * gs->dist_inner_lattice_sites * l->num_lattice_site_var ); + FREE( gs->transfer_buffer.vector_buffer, complex_PRECISION, 2 * gs->dist_inner_lattice_sites * l->num_lattice_site_var ); #else - FREE( gs->transfer_buffer, complex_PRECISION, gs->dist_inner_lattice_sites * l->num_lattice_site_var ); + FREE( gs->transfer_buffer.vector_buffer, complex_PRECISION, gs->dist_inner_lattice_sites * l->num_lattice_site_var ); #endif } @@ -270,24 +270,30 @@ void conf_PRECISION_gather( operator_PRECISION_struct *out, operator_PRECISION_s } else { int i, j, n=l->gs_PRECISION.gather_list_length, s=l->num_inner_lattice_sites, t, *pi = l->gs_PRECISION.permutation; - vector_PRECISION buffer_hopp = NULL, buffer_clov = NULL, buffer_odd_proj = NULL; + vector_PRECISION buffer_hopp, buffer_clov, buffer_odd_proj; MPI_Request *hopp_reqs = NULL, *clov_reqs = NULL, *odd_proj_reqs = NULL; + vector_PRECISION_init(&buffer_hopp); + vector_PRECISION_init(&buffer_clov); + vector_PRECISION_init(&buffer_odd_proj); + #ifdef HAVE_TM1p1 - vector_PRECISION buffer_eps_term = NULL; + vector_PRECISION buffer_eps_term; + vector_PRECISION_init(&buffer_eps_term); MPI_Request *eps_term_reqs = NULL; - MALLOC( buffer_eps_term, complex_PRECISION, n*send_size_block ); + MALLOC( buffer_eps_term.vector_buffer, complex_PRECISION, n*send_size_block ); MALLOC( eps_term_reqs, 
MPI_Request, n ); #endif #ifdef HAVE_TM - vector_PRECISION buffer_tm_term = NULL; + vector_PRECISION buffer_tm_term; + vector_PRECISION_init(&buffer_tm_term); MPI_Request *tm_term_reqs = NULL; - MALLOC( buffer_tm_term, complex_PRECISION, n*send_size_block ); + MALLOC( buffer_tm_term.vector_buffer, complex_PRECISION, n*send_size_block ); MALLOC( tm_term_reqs, MPI_Request, n ); #endif - MALLOC( buffer_hopp, complex_PRECISION, n*send_size_hopp ); - MALLOC( buffer_clov, complex_PRECISION, n*send_size_clov ); - MALLOC( buffer_odd_proj, complex_PRECISION, n*send_size_block ); + MALLOC( buffer_hopp.vector_buffer, complex_PRECISION, n*send_size_hopp ); + MALLOC( buffer_clov.vector_buffer, complex_PRECISION, n*send_size_clov ); + MALLOC( buffer_odd_proj.vector_buffer, complex_PRECISION, n*send_size_block ); MALLOC( hopp_reqs, MPI_Request, n ); MALLOC( clov_reqs, MPI_Request, n ); MALLOC( odd_proj_reqs, MPI_Request, n ); @@ -295,39 +301,39 @@ void conf_PRECISION_gather( operator_PRECISION_struct *out, operator_PRECISION_s PROF_PRECISION_START( _GD_COMM ); for ( i=1; i<n; i++ ) { #ifdef HAVE_TM1p1 - MPI_Irecv( buffer_eps_term+i*send_size_block, send_size_block, MPI_COMPLEX_PRECISION, + MPI_Irecv( buffer_eps_term.vector_buffer+i*send_size_block, send_size_block, MPI_COMPLEX_PRECISION, l->gs_PRECISION.gather_list[i], 4, g.comm_cart, &(eps_term_reqs[i]) ); #endif #ifdef HAVE_TM - MPI_Irecv( buffer_tm_term+i*send_size_block, send_size_block, MPI_COMPLEX_PRECISION, + MPI_Irecv( buffer_tm_term.vector_buffer+i*send_size_block, send_size_block, MPI_COMPLEX_PRECISION, l->gs_PRECISION.gather_list[i], 3, g.comm_cart, &(tm_term_reqs[i]) ); #endif - MPI_Irecv( buffer_hopp+i*send_size_hopp, send_size_hopp, MPI_COMPLEX_PRECISION, + MPI_Irecv( buffer_hopp.vector_buffer+i*send_size_hopp, send_size_hopp, MPI_COMPLEX_PRECISION, l->gs_PRECISION.gather_list[i], 0, g.comm_cart, &(hopp_reqs[i]) ); - MPI_Irecv( buffer_clov+i*send_size_clov, send_size_clov, MPI_COMPLEX_PRECISION, + MPI_Irecv( buffer_clov.vector_buffer+i*send_size_clov, send_size_clov, MPI_COMPLEX_PRECISION, l->gs_PRECISION.gather_list[i], 1, g.comm_cart, &(clov_reqs[i]) ); - MPI_Irecv( buffer_odd_proj+i*send_size_block, send_size_block, MPI_COMPLEX_PRECISION, + MPI_Irecv( buffer_odd_proj.vector_buffer+i*send_size_block, send_size_block, MPI_COMPLEX_PRECISION, l->gs_PRECISION.gather_list[i], 2, g.comm_cart, &(odd_proj_reqs[i]) ); } PROF_PRECISION_STOP( _GD_COMM, 2*n-2 ); #ifdef HAVE_TM1p1 for ( i=0; i<send_size_block; i++ ) - buffer_eps_term[i] = in->epsbar_term[i]; + buffer_eps_term.vector_buffer[i] = in->epsbar_term[i]; #endif #ifdef HAVE_TM for ( i=0; i<send_size_block; i++ ) - buffer_tm_term[i] = in->tm_term[i]; + buffer_tm_term.vector_buffer[i] = in->tm_term[i]; #endif for ( i=0; i<send_size_hopp; i++ ) - buffer_hopp[i] = in->D[i]; + buffer_hopp.vector_buffer[i] = in->D[i]; for ( i=0; i<send_size_clov; i++ ) - buffer_clov[i] = in->clover[i]; + buffer_clov.vector_buffer[i] = in->clover[i]; for ( i=0; i<send_size_block; i++ ) - buffer_odd_proj[i] = in->odd_proj[i]; + buffer_odd_proj.vector_buffer[i] = in->odd_proj[i]; #ifdef HAVE_TM1p1 PROF_PRECISION_START( _GD_IDLE ); @@ -338,7 +344,7 @@ void conf_PRECISION_gather( operator_PRECISION_struct *out, operator_PRECISION_s t = (send_size_block*n)/s; for ( i=0; i<s; i++ ) for ( j=0; j<t; j++ ) - out->epsbar_term[ t*pi[i] + j ] = buffer_eps_term[ t*i + j ]; + out->epsbar_term[ t*pi[i] + j ] = buffer_eps_term.vector_buffer[ t*i + j ]; #endif #ifdef HAVE_TM PROF_PRECISION_START( _GD_IDLE ); @@ -349,7 +355,7 @@ void conf_PRECISION_gather( operator_PRECISION_struct *out, operator_PRECISION_s t = (send_size_block*n)/s; for ( i=0; i<s; i++ ) for ( j=0; j<t; j++ ) - out->tm_term[ t*pi[i] + j ] = buffer_tm_term[ t*i + j ]; + out->tm_term[ t*pi[i] + j ] = buffer_tm_term.vector_buffer[ t*i + j ]; #endif PROF_PRECISION_START( _GD_IDLE ); @@ -360,7 +366,7 @@ void conf_PRECISION_gather( operator_PRECISION_struct *out, operator_PRECISION_s t = (send_size_hopp*n)/s; for ( i=0; i<s; i++ ) for ( j=0; j<t; j++ ) - out->D[ t*pi[i] + j ] = buffer_hopp[ t*i + j ]; + out->D[ t*pi[i] + j ] =
buffer_hopp.vector_buffer[ t*i + j ]; PROF_PRECISION_START( _GD_IDLE ); for ( i=1; i<n; i++ ) MPI_Wait( &(clov_reqs[i]), MPI_STATUS_IGNORE ); PROF_PRECISION_STOP( _GD_IDLE, n-1 ); t = (send_size_clov*n)/s; for ( i=0; i<s; i++ ) for ( j=0; j<t; j++ ) - out->clover[ t*pi[i] + j ] = buffer_clov[ t*i + j ]; + out->clover[ t*pi[i] + j ] = buffer_clov.vector_buffer[ t*i + j ]; PROF_PRECISION_START( _GD_IDLE ); for ( i=1; i<n; i++ ) MPI_Wait( &(odd_proj_reqs[i]), MPI_STATUS_IGNORE ); PROF_PRECISION_STOP( _GD_IDLE, n-1 ); t = (send_size_block*n)/s; for ( i=0; i<s; i++ ) for ( j=0; j<t; j++ ) - out->odd_proj[ t*pi[i] + j ] = buffer_odd_proj[ t*i + j ]; + out->odd_proj[ t*pi[i] + j ] = buffer_odd_proj.vector_buffer[ t*i + j ]; - FREE( buffer_hopp, complex_PRECISION, n*send_size_hopp ); - FREE( buffer_clov, complex_PRECISION, n*send_size_clov ); - FREE( buffer_odd_proj, complex_PRECISION, n*send_size_block ); + FREE( buffer_hopp.vector_buffer, complex_PRECISION, n*send_size_hopp ); + FREE( buffer_clov.vector_buffer, complex_PRECISION, n*send_size_clov ); + FREE( buffer_odd_proj.vector_buffer, complex_PRECISION, n*send_size_block ); FREE( hopp_reqs, MPI_Request, n ); FREE( clov_reqs, MPI_Request, n ); FREE( odd_proj_reqs, MPI_Request, n ); #ifdef HAVE_TM - FREE( buffer_tm_term, complex_PRECISION, n*send_size_block ); + FREE( buffer_tm_term.vector_buffer, complex_PRECISION, n*send_size_block ); FREE( tm_term_reqs, MPI_Request, n ); #endif #ifdef HAVE_TM1p1 - FREE( buffer_eps_term, complex_PRECISION, n*send_size_block ); + FREE( buffer_eps_term.vector_buffer, complex_PRECISION, n*send_size_block ); FREE( eps_term_reqs, MPI_Request, n ); #endif @@ -408,12 +414,12 @@ void conf_PRECISION_gather( operator_PRECISION_struct *out, operator_PRECISION_s l->dummy_p_PRECISION.eval_operator = apply_coarse_operator_PRECISION; } -void vector_PRECISION_gather( vector_PRECISION gath, vector_PRECISION dist, level_struct *l ) { +void vector_PRECISION_gather( vector_PRECISION *gath, vector_PRECISION *dist, level_struct *l ) { int send_size = l->gs_PRECISION.dist_inner_lattice_sites * l->num_lattice_site_var; if ( g.my_rank != l->parent_rank ) { - MPI_Send( dist, send_size, MPI_COMPLEX_PRECISION, l->parent_rank, g.my_rank, g.comm_cart ); + MPI_Send( dist->vector_buffer, send_size, MPI_COMPLEX_PRECISION, l->parent_rank, g.my_rank, g.comm_cart ); } else { int i, j, n=l->gs_PRECISION.gather_list_length, s=l->num_inner_lattice_sites, t=l->num_lattice_site_var, *pi = l->gs_PRECISION.permutation; @@ -421,12 +427,12 @@ void vector_PRECISION_gather( vector_PRECISION gath, vector_PRECISION dist, leve PROF_PRECISION_START( _GD_COMM ); for ( i=1; i<n; i++ ) - MPI_Irecv( buffer+i*send_size, send_size, MPI_COMPLEX_PRECISION, l->gs_PRECISION.gather_list[i], + MPI_Irecv( buffer.vector_buffer+i*send_size, send_size, MPI_COMPLEX_PRECISION, l->gs_PRECISION.gather_list[i], l->gs_PRECISION.gather_list[i], g.comm_cart, &(l->gs_PRECISION.reqs[i]) ); PROF_PRECISION_STOP( _GD_COMM, n-1 ); for ( i=0; i<send_size; i++ ) - buffer[i] = dist[i]; + buffer.vector_buffer[i] = dist->vector_buffer[i]; PROF_PRECISION_START( _GD_IDLE ); for ( i=1; i<n; i++ ) MPI_Wait( &(l->gs_PRECISION.reqs[i]), MPI_STATUS_IGNORE ); PROF_PRECISION_STOP( _GD_IDLE, n-1 ); for ( i=0; i<s; i++ ) for ( j=0; j<t; j++ ) - gath[ t*pi[i] + j ] = buffer[ t*i + j ]; + gath->vector_buffer[ t*pi[i] + j ] = buffer.vector_buffer[ t*i + j ]; } }
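/* vector_PRECISION_gather and vector_PRECISION_distribute (next) form the round trip between a
   parent rank and the ranks on its gather list. A hedged usage sketch, following how the
   interpolation code later in this patch drives the pair on the coarse grid (phi_c stands for any
   coarse-level vector_PRECISION*):

     START_LOCKED_MASTER(threading)
     // spread the coarse vector over the ranks of the next level
     vector_PRECISION_distribute( &(l->next_level->gs_PRECISION.transfer_buffer), phi_c, l->next_level );
     END_LOCKED_MASTER(threading)
     // ... rank-local work on transfer_buffer ...
     START_LOCKED_MASTER(threading)
     // collect the chunks back, permuted via gs_PRECISION.permutation
     vector_PRECISION_gather( phi_c, &(l->next_level->gs_PRECISION.transfer_buffer), l->next_level );
     END_LOCKED_MASTER(threading)

   Both calls are collective over the gather list, so every listed rank must enter them. */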
-void vector_PRECISION_distribute( vector_PRECISION dist, vector_PRECISION gath, level_struct *l ) { +void vector_PRECISION_distribute( vector_PRECISION *dist, vector_PRECISION *gath, level_struct *l ) { int send_size = l->gs_PRECISION.dist_inner_lattice_sites * l->num_lattice_site_var; if ( g.my_rank != l->parent_rank ) { - MPI_Recv( dist, send_size, MPI_COMPLEX_PRECISION, l->parent_rank, g.my_rank, g.comm_cart, MPI_STATUS_IGNORE ); + MPI_Recv( dist->vector_buffer, send_size, MPI_COMPLEX_PRECISION, l->parent_rank, g.my_rank, g.comm_cart, MPI_STATUS_IGNORE ); } else { int i, j, n=l->gs_PRECISION.gather_list_length, s=l->num_inner_lattice_sites, t=l->num_lattice_site_var, *pi = l->gs_PRECISION.permutation; @@ -453,16 +459,16 @@ void vector_PRECISION_distribute( vector_PRECISION dist, vector_PRECISION gath, // permute data according to desired distributed data layout for ( i=0; i<s; i++ ) for ( j=0; j<t; j++ ) - buffer[ t*i + j ] = gath[ t*pi[i]+j ]; + buffer.vector_buffer[ t*i + j ] = gath->vector_buffer[ t*pi[i]+j ]; PROF_PRECISION_START( _GD_COMM ); for ( i=1; i<n; i++ ) - MPI_Isend( buffer+i*send_size, send_size, MPI_COMPLEX_PRECISION, l->gs_PRECISION.gather_list[i], + MPI_Isend( buffer.vector_buffer+i*send_size, send_size, MPI_COMPLEX_PRECISION, l->gs_PRECISION.gather_list[i], l->gs_PRECISION.gather_list[i], g.comm_cart, &(l->gs_PRECISION.reqs[i]) ); PROF_PRECISION_STOP( _GD_COMM, n-1 ); for ( i=0; i<send_size; i++ ) - dist[i] = buffer[i]; + dist->vector_buffer[i] = buffer.vector_buffer[i]; PROF_PRECISION_START( _GD_IDLE ); for ( i=1; i<n; i++ ) MPI_Wait( &(l->gs_PRECISION.reqs[i]), MPI_STATUS_IGNORE ); PROF_PRECISION_STOP( _GD_IDLE, n-1 ); } } diff --git a/src/ghost_generic.c b/src/ghost_generic.c --- a/src/ghost_generic.c +++ b/src/ghost_generic.c -void negative_sendrecv_PRECISION( vector_PRECISION phi, const int mu, comm_PRECISION_struct *c, level_struct *l ) { +void negative_sendrecv_PRECISION( vector_PRECISION *phi, const int mu, comm_PRECISION_struct *c, level_struct *l ) { if( l->global_splitting[mu] > 1 ) { @@ -34,18 +34,18 @@ void negative_sendrecv_PRECISION( vector_PRECISION phi, const int mu, comm_PRECI for ( i=0; i<mu; i++ ) boundary_start += c->num_boundary_sites[2*i]; - buffer = l->vbuf_PRECISION[8]+n*(boundary_start-l->num_inner_lattice_sites); - buffer_pt = buffer; + buffer.vector_buffer = l->vbuf_PRECISION[8].vector_buffer+n*(boundary_start-l->num_inner_lattice_sites); + buffer_pt.vector_buffer = buffer.vector_buffer; for ( i=0; i<num_boundary_sites; i++ ) { - phi_pt = phi + n*boundary_table[i]; - for ( j=0; j<n; j++, buffer_pt++ ) - *buffer_pt = phi_pt[j]; + phi_pt.vector_buffer = phi->vector_buffer + n*boundary_table[i]; + for ( j=0; j<n; j++, buffer_pt.vector_buffer++ ) + *(buffer_pt.vector_buffer) = phi_pt.vector_buffer[j]; } - MPI_Irecv( phi+n*boundary_start, n*num_boundary_sites, MPI_COMPLEX_PRECISION, + MPI_Irecv( phi->vector_buffer+n*boundary_start, n*num_boundary_sites, MPI_COMPLEX_PRECISION, l->neighbor_rank[2*mu], 2*mu+1, g.comm_cart, &(c->rreqs[2*mu+1]) ); - MPI_Isend( buffer, n*num_boundary_sites, MPI_COMPLEX_PRECISION, + MPI_Isend( buffer.vector_buffer, n*num_boundary_sites, MPI_COMPLEX_PRECISION, l->neighbor_rank[2*mu+1], 2*mu+1, g.comm_cart, &(c->sreqs[2*mu+1]) ); } } @@ -124,8 +124,8 @@ void ghost_alloc_PRECISION( int buffer_size, comm_PRECISION_struct *c, level_str c->length[2*mu] = buffer_size; c->length[2*mu+1] = buffer_size; c->max_length[mu] = factor*buffer_size; - MALLOC( c->buffer[2*mu], complex_PRECISION, factor*buffer_size ); - MALLOC( c->buffer[2*mu+1], complex_PRECISION, factor*buffer_size ); + MALLOC( c->buffer[2*mu].vector_buffer, complex_PRECISION, factor*buffer_size ); + MALLOC( c->buffer[2*mu+1].vector_buffer, complex_PRECISION, factor*buffer_size ); c->in_use[2*mu] = 0; c->in_use[2*mu+1] = 0; } @@ -133,20 +133,20 @@ for ( mu=0; mu<4; mu++ ) { c->max_length[mu] = buffer_size; #ifdef HAVE_TM1p1 - MALLOC( c->buffer[2*mu], complex_PRECISION, 2*buffer_size ); - MALLOC( c->buffer[2*mu+1], complex_PRECISION, 2*buffer_size ); + MALLOC( c->buffer[2*mu].vector_buffer, complex_PRECISION, 2*buffer_size ); + MALLOC( c->buffer[2*mu+1].vector_buffer, complex_PRECISION, 2*buffer_size ); #else - MALLOC( c->buffer[2*mu], complex_PRECISION, buffer_size ); - MALLOC( c->buffer[2*mu+1], complex_PRECISION, buffer_size ); + MALLOC( c->buffer[2*mu].vector_buffer, complex_PRECISION, buffer_size ); + MALLOC( c->buffer[2*mu+1].vector_buffer, complex_PRECISION, buffer_size ); #endif } } - if ( l->vbuf_PRECISION[8] == NULL ) { + if ( l->vbuf_PRECISION[8].vector_buffer == NULL ) { #ifdef HAVE_TM1p1 - MALLOC( l->vbuf_PRECISION[8], complex_PRECISION, 2*l->vector_size ); + MALLOC( l->vbuf_PRECISION[8].vector_buffer, complex_PRECISION, 2*l->vector_size ); #else - MALLOC( l->vbuf_PRECISION[8], complex_PRECISION, l->vector_size ); + MALLOC( l->vbuf_PRECISION[8].vector_buffer, complex_PRECISION, l->vector_size ); #endif } } @@ -157,15 +157,15 @@ void ghost_free_PRECISION( comm_PRECISION_struct *c, level_struct *l ) { int mu; for ( mu=0; mu<4; mu++ ) { - FREE( c->buffer[2*mu], complex_PRECISION, c->max_length[mu] ); - FREE( c->buffer[2*mu+1], complex_PRECISION, c->max_length[mu] ); + FREE( c->buffer[2*mu].vector_buffer, complex_PRECISION, c->max_length[mu] ); + FREE( c->buffer[2*mu+1].vector_buffer, complex_PRECISION, c->max_length[mu] ); } - if (
l->vbuf_PRECISION[8].vector_buffer != NULL ) { #ifdef HAVE_TM1p1 - FREE( l->vbuf_PRECISION[8], complex_PRECISION, 2*l->vector_size ); + FREE( l->vbuf_PRECISION[8].vector_buffer, complex_PRECISION, 2*l->vector_size ); #else - FREE( l->vbuf_PRECISION[8], complex_PRECISION, l->vector_size ); + FREE( l->vbuf_PRECISION[8].vector_buffer, complex_PRECISION, l->vector_size ); #endif } } @@ -185,7 +185,7 @@ void ghost_sendrecv_init_PRECISION( const int type, comm_PRECISION_struct *c, le } -void ghost_sendrecv_PRECISION( vector_PRECISION phi, const int mu, const int dir, +void ghost_sendrecv_PRECISION( buffer_PRECISION phi, const int mu, const int dir, comm_PRECISION_struct *c, const int amount, level_struct *l ) { // does not allow sending in both directions at the same time if( l->global_splitting[mu] > 1 ) { @@ -229,7 +229,7 @@ void ghost_sendrecv_PRECISION( vector_PRECISION phi, const int mu, const int dir ghost_alloc_PRECISION( MAX(length[0],length[1]), c, l ); } - buffer = (vector_PRECISION)c->buffer[mu_dir]; + buffer = c->buffer[mu_dir]; // dir = senddir if ( dir == 1 ) { @@ -238,16 +238,16 @@ void ghost_sendrecv_PRECISION( vector_PRECISION phi, const int mu, const int dir // afterwards (in ghost_wait) the data has to be distributed onto the correct sites // touching the respective boundary in -mu direction - phi_pt = phi + comm_start; + phi_pt.vector_buffer = phi + comm_start; if ( length[1] > 0 ) { PROF_PRECISION_START( _OP_COMM ); - MPI_Irecv( buffer, length[1], MPI_COMPLEX_PRECISION, + MPI_Irecv( buffer.vector_buffer, length[1], MPI_COMPLEX_PRECISION, l->neighbor_rank[2*mu+1], 2*mu, g.comm_cart, &(c->rreqs[2*mu]) ); PROF_PRECISION_STOP( _OP_COMM, 1 ); } if ( length[0] > 0 ) { PROF_PRECISION_START( _OP_COMM ); - MPI_Isend( phi_pt, length[0], MPI_COMPLEX_PRECISION, + MPI_Isend( phi_pt.vector_buffer, length[0], MPI_COMPLEX_PRECISION, l->neighbor_rank[2*mu], 2*mu, g.comm_cart, &(c->sreqs[2*mu]) ); PROF_PRECISION_STOP( _OP_COMM, 0 ); } @@ -261,25 +261,25 @@ void ghost_sendrecv_PRECISION( vector_PRECISION phi, const int mu, const int dir table = c->boundary_table[2*mu+1]+table_start; for ( j=0; j<length[1]/offset; j++ ) { - for ( i=0; i<offset; i++, buffer++ ) - *buffer = phi[ offset*table[j]+i ]; + for ( i=0; i<offset; i++, buffer.vector_buffer++ ) + *(buffer.vector_buffer) = phi[ offset*table[j]+i ]; } - buffer = (vector_PRECISION)c->buffer[mu_dir]; - phi_pt = phi + comm_start; + buffer = c->buffer[mu_dir]; + phi_pt.vector_buffer = phi + comm_start; if ( length[0] > 0 ) { PROF_PRECISION_START( _OP_COMM ); - MPI_Irecv( phi_pt, length[0], MPI_COMPLEX_PRECISION, + MPI_Irecv( phi_pt.vector_buffer, length[0], MPI_COMPLEX_PRECISION, l->neighbor_rank[2*mu], 2*mu+1, g.comm_cart, &(c->rreqs[2*mu+1]) ); PROF_PRECISION_STOP( _OP_COMM, 1 ); } if ( length[1] > 0 ) { PROF_PRECISION_START( _OP_COMM ); - MPI_Isend( buffer, length[1], MPI_COMPLEX_PRECISION, + MPI_Isend( buffer.vector_buffer, length[1], MPI_COMPLEX_PRECISION, l->neighbor_rank[2*mu+1], 2*mu+1, g.comm_cart, &(c->sreqs[2*mu+1]) ); PROF_PRECISION_STOP( _OP_COMM, 0 ); } @@ -289,7 +289,7 @@ void ghost_sendrecv_PRECISION( vector_PRECISION phi, const int mu, const int dir } -void ghost_wait_PRECISION( vector_PRECISION phi, const int mu, const int dir, +void ghost_wait_PRECISION( buffer_PRECISION phi, const int
mu, const int dir, comm_PRECISION_struct *c, const int amount, level_struct *l ) { if( l->global_splitting[mu] > 1 ) { @@ -322,7 +322,7 @@ void ghost_wait_PRECISION( vector_PRECISION phi, const int mu, const int dir, int num_boundary_sites = length[0]/offset; - buffer = (vector_PRECISION)c->buffer[mu_dir]; + buffer = c->buffer[mu_dir]; table = c->boundary_table[2*mu+1] + table_start; if ( length[0] > 0 ) { @@ -338,21 +338,21 @@ void ghost_wait_PRECISION( vector_PRECISION phi, const int mu, const int dir, if ( l->depth == 0 ) { for ( j=0; j<num_boundary_sites; j++ ) { - phi_pt = phi + table[j]*offset; + phi_pt.vector_buffer = phi + table[j]*offset; for ( i=0; i<offset; i++ ) - phi_pt[i] += buffer[i]; + phi_pt.vector_buffer[i] += buffer.vector_buffer[i]; - buffer += offset; + buffer.vector_buffer += offset; } } else { for ( j=0; j<num_boundary_sites; j++ ) { - phi_pt = phi + table[j]*offset; + phi_pt.vector_buffer = phi + table[j]*offset; for ( i=0; i<offset; i++ ) - phi_pt[i] = buffer[i]; + phi_pt.vector_buffer[i] = buffer.vector_buffer[i]; - buffer += offset; + buffer.vector_buffer += offset; } } } } } -void ghost_update_PRECISION( vector_PRECISION phi, const int mu, const int dir, comm_PRECISION_struct *c, level_struct *l ) { +void ghost_update_PRECISION( vector_PRECISION *phi, const int mu, const int dir, comm_PRECISION_struct *c, level_struct *l ) { if( l->global_splitting[mu] > 1 ) { int i, j, mu_dir = 2*mu-MIN(dir,0), nu, inv_mu_dir = 2*mu+1+MIN(dir,0), length, *table=NULL, comm_start, num_boundary_sites, site_var; vector_PRECISION buffer, recv_pt, phi_pt; @@ -385,7 +386,7 @@ void ghost_update_PRECISION( vector_PRECISION phi, const int mu, const int dir, site_var = l->num_lattice_site_var; length = c->num_boundary_sites[mu_dir]*l->num_lattice_site_var; num_boundary_sites = c->num_boundary_sites[mu_dir]; - buffer = (vector_PRECISION)c->buffer[mu_dir]; + buffer = c->buffer[mu_dir]; if ( dir == -1 ) comm_start = l->vector_size; @@ -398,28 +399,28 @@ void ghost_update_PRECISION( vector_PRECISION phi, const int mu, const int dir, ASSERT( c->in_use[mu_dir] == 0 ); c->in_use[mu_dir] = 1; - recv_pt = phi + comm_start; + recv_pt.vector_buffer = phi->vector_buffer + comm_start; if ( length > 0 ) { PROF_PRECISION_START( _OP_COMM ); - MPI_Irecv( recv_pt, length, MPI_COMPLEX_PRECISION, + MPI_Irecv( recv_pt.vector_buffer, length, MPI_COMPLEX_PRECISION, l->neighbor_rank[mu_dir], mu_dir, g.comm_cart, &(c->rreqs[mu_dir]) ); PROF_PRECISION_STOP( _OP_COMM, 1 ); } table = c->boundary_table[inv_mu_dir]; for ( j=0; j<num_boundary_sites; j++ ) { - phi_pt = phi + table[j]*site_var; + phi_pt.vector_buffer = phi->vector_buffer + table[j]*site_var; for ( i=0; i<site_var; i++ ) - buffer[i] = phi_pt[i]; + buffer.vector_buffer[i] = phi_pt.vector_buffer[i]; - buffer += site_var; + buffer.vector_buffer += site_var; } - buffer = (vector_PRECISION)c->buffer[mu_dir]; + buffer = c->buffer[mu_dir]; if ( length > 0 ) { PROF_PRECISION_START( _OP_COMM ); - MPI_Isend( buffer, length, MPI_COMPLEX_PRECISION, + MPI_Isend( buffer.vector_buffer, length, MPI_COMPLEX_PRECISION, l->neighbor_rank[inv_mu_dir], mu_dir, g.comm_cart, &(c->sreqs[mu_dir]) ); PROF_PRECISION_STOP( _OP_COMM, 0 ); } @@ -427,9 +428,10 @@ void ghost_update_PRECISION( vector_PRECISION phi, const int mu, const int dir, } -void ghost_update_wait_PRECISION( vector_PRECISION phi, const int mu, const int dir, comm_PRECISION_struct *c, level_struct *l ) { +void ghost_update_wait_PRECISION( vector_PRECISION *phi, const int mu, const int dir, comm_PRECISION_struct *c, level_struct *l ) { if( l->global_splitting[mu] > 1 ) { int mu_dir = 2*mu-MIN(dir,0), length = c->num_boundary_sites[mu_dir]*l->num_lattice_site_var; ASSERT( c->in_use[mu_dir] == 1 );
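/* The ghost_* changes above split the halo-exchange API: ghost_sendrecv_PRECISION and
   ghost_wait_PRECISION now take the raw buffer_PRECISION, while ghost_update_PRECISION /
   ghost_update_wait_PRECISION take the struct. A hedged sketch of the usual start-then-wait
   pattern with the new types (op and _FULL_SYSTEM come from the surrounding operator code,
   which this patch does not show, so treat them as assumptions):

     vector_PRECISION phi;   // struct-wrapped vector, buffer already allocated
     for ( int mu=0; mu<4; mu++ )
       ghost_sendrecv_PRECISION( phi.vector_buffer, mu, +1, &(op->c), _FULL_SYSTEM, l );
     // ... overlap interior computation here ...
     for ( int mu=0; mu<4; mu++ )
       ghost_wait_PRECISION( phi.vector_buffer, mu, +1, &(op->c), _FULL_SYSTEM, l );
*/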
diff --git a/src/ghost_generic.h b/src/ghost_generic.h index 7b5b019..24d14b1 100644 --- a/src/ghost_generic.h +++ b/src/ghost_generic.h @@ -22,7 +22,7 @@ #ifndef GHOST_PRECISION_HEADER #define GHOST_PRECISION_HEADER - void negative_sendrecv_PRECISION( vector_PRECISION phi, const int mu, comm_PRECISION_struct *c, level_struct *l ); + void negative_sendrecv_PRECISION( vector_PRECISION *phi, const int mu, comm_PRECISION_struct *c, level_struct *l ); // as negative_sendrecv_PRECISION, but for count vectors stored in phi in vector-fused data layout // buffer must be big enough to hold the surface data for count vectors (in one direction) @@ -32,12 +32,12 @@ void ghost_alloc_PRECISION( int buffer_size, comm_PRECISION_struct *c, level_struct *l ); void ghost_free_PRECISION( comm_PRECISION_struct *c, level_struct *l ); void ghost_sendrecv_init_PRECISION( const int type, comm_PRECISION_struct *c, level_struct *l ); - void ghost_sendrecv_PRECISION( vector_PRECISION phi, const int mu, const int dir, + void ghost_sendrecv_PRECISION( buffer_PRECISION phi, const int mu, const int dir, comm_PRECISION_struct *c, const int amount, level_struct *l ); - void ghost_wait_PRECISION( vector_PRECISION phi, const int mu, const int dir, + void ghost_wait_PRECISION( buffer_PRECISION phi, const int mu, const int dir, comm_PRECISION_struct *c, const int amount, level_struct *l ); - void ghost_update_PRECISION( vector_PRECISION phi, const int mu, const int dir, comm_PRECISION_struct *c, level_struct *l ); - void ghost_update_wait_PRECISION( vector_PRECISION phi, const int mu, const int dir, comm_PRECISION_struct *c, level_struct *l ); + void ghost_update_PRECISION( vector_PRECISION *phi, const int mu, const int dir, comm_PRECISION_struct *c, level_struct *l ); + void ghost_update_wait_PRECISION( vector_PRECISION *phi, const int mu, const int dir, comm_PRECISION_struct *c, level_struct *l ); #endif diff --git a/src/init.c index cd83ce4..e67e3da 100644 --- a/src/init.c +++ b/src/init.c @@ -646,8 +646,8 @@ void l_init( level_struct *l ) { level_double_init( l ); level_float_init( l ); - - l->x = NULL; + + vector_double_init(&(l->x)); l->next_level = NULL; l->reqs = NULL; } diff --git a/src/init_generic.c index c4b284f..b59a9f1 100644 --- a/src/init_generic.c +++ b/src/init_generic.c @@ -98,18 +98,19 @@ double prof_PRECISION_print( level_struct *l ) { void fine_level_PRECISION_alloc( level_struct *l ) { int n = 8; + vector_PRECISION_init(&(l->vbuf_PRECISION[0])); #ifdef HAVE_TM1p1 - MALLOC( l->vbuf_PRECISION[0], complex_PRECISION, 2*n*l->vector_size ); + MALLOC( l->vbuf_PRECISION[0].vector_buffer, complex_PRECISION, 2*n*l->vector_size ); for ( int i=1; i<n; i++ ) - l->vbuf_PRECISION[i] = l->vbuf_PRECISION[0] + 2*i*l->vector_size; - MALLOC( l->p_PRECISION.b, complex_PRECISION, 2*2*l->inner_vector_size ); - l->p_PRECISION.x = l->p_PRECISION.b + 2*l->inner_vector_size; + l->vbuf_PRECISION[i].vector_buffer = l->vbuf_PRECISION[0].vector_buffer + 2*i*l->vector_size; + MALLOC( l->p_PRECISION.b.vector_buffer, complex_PRECISION, 2*2*l->inner_vector_size ); + l->p_PRECISION.x.vector_buffer = l->p_PRECISION.b.vector_buffer + 2*l->inner_vector_size; #else - MALLOC( l->vbuf_PRECISION[0], complex_PRECISION, n*l->vector_size ); + MALLOC( l->vbuf_PRECISION[0].vector_buffer, complex_PRECISION, n*l->vector_size ); for ( int i=1; i<n; i++ ) - l->vbuf_PRECISION[i] = l->vbuf_PRECISION[0] + i*l->vector_size; - MALLOC( l->p_PRECISION.b, complex_PRECISION, 2*l->inner_vector_size ); - l->p_PRECISION.x = l->p_PRECISION.b + l->inner_vector_size; + l->vbuf_PRECISION[i].vector_buffer = l->vbuf_PRECISION[0].vector_buffer + i*l->vector_size; + MALLOC( l->p_PRECISION.b.vector_buffer, complex_PRECISION, 2*l->inner_vector_size ); + l->p_PRECISION.x.vector_buffer = l->p_PRECISION.b.vector_buffer + l->inner_vector_size; #endif } @@ -117,19 +118,18 @@ void fine_level_PRECISION_alloc( level_struct *l ) { void fine_level_PRECISION_free( level_struct *l ) { int n = 8; - #ifdef HAVE_TM1p1 - FREE( l->vbuf_PRECISION[0], complex_PRECISION, 2*n*l->vector_size ); + FREE( l->vbuf_PRECISION[0].vector_buffer, complex_PRECISION, 2*n*l->vector_size ); for ( int i=1; i<n; i++ ) - l->vbuf_PRECISION[i] = NULL; - FREE( l->p_PRECISION.b, complex_PRECISION, 2*2*l->inner_vector_size ); - l->p_PRECISION.x = NULL; + vector_PRECISION_init(&(l->vbuf_PRECISION[i])); + FREE( l->p_PRECISION.b.vector_buffer, complex_PRECISION, 2*2*l->inner_vector_size ); + vector_PRECISION_init(&(l->p_PRECISION.x)); #else - FREE( l->vbuf_PRECISION[0], complex_PRECISION, n*l->vector_size ); + FREE( l->vbuf_PRECISION[0].vector_buffer, complex_PRECISION, n*l->vector_size ); for ( int i=1; i<n; i++ ) - l->vbuf_PRECISION[i] = NULL; - FREE( l->p_PRECISION.b, complex_PRECISION, 2*l->inner_vector_size ); - l->p_PRECISION.x = NULL; + vector_PRECISION_init(&(l->vbuf_PRECISION[i])); + FREE( l->p_PRECISION.b.vector_buffer, complex_PRECISION, 2*l->inner_vector_size ); +
vector_PRECISION_init(&(l->p_PRECISION.x)); #endif } @@ -158,12 +158,13 @@ void next_level_PRECISION_setup( level_struct *l ) { g.method==6?g5D_apply_coarse_operator_PRECISION:apply_coarse_operator_PRECISION, &(l->next_level->p_PRECISION), l->next_level ); } else { + vector_PRECISION_init(&(l->next_level->p_PRECISION.b)); #ifdef HAVE_TM1p1 - MALLOC( l->next_level->p_PRECISION.b, complex_PRECISION, 2*2*l->next_level->vector_size ); - l->next_level->p_PRECISION.x = l->next_level->p_PRECISION.b + 2*l->next_level->vector_size; + MALLOC( l->next_level->p_PRECISION.b.vector_buffer, complex_PRECISION, 2*2*l->next_level->vector_size ); + l->next_level->p_PRECISION.x.vector_buffer = l->next_level->p_PRECISION.b.vector_buffer + 2*l->next_level->vector_size; #else - MALLOC( l->next_level->p_PRECISION.b, complex_PRECISION, 2*l->next_level->vector_size ); - l->next_level->p_PRECISION.x = l->next_level->p_PRECISION.b + l->next_level->vector_size; + MALLOC( l->next_level->p_PRECISION.b.vector_buffer, complex_PRECISION, 2*l->next_level->vector_size ); + l->next_level->p_PRECISION.x.vector_buffer = l->next_level->p_PRECISION.b.vector_buffer + l->next_level->vector_size; #endif l->next_level->p_PRECISION.v_start = 0; l->next_level->p_PRECISION.v_end = l->next_level->inner_vector_size; @@ -171,14 +172,15 @@ void next_level_PRECISION_setup( level_struct *l ) { } int i, n = (l->next_level->level>0)?6:4; + vector_PRECISION_init(&(l->next_level->vbuf_PRECISION[0])); #ifdef HAVE_TM1p1 - MALLOC( l->next_level->vbuf_PRECISION[0], complex_PRECISION, 2*n*l->next_level->vector_size ); + MALLOC( l->next_level->vbuf_PRECISION[0].vector_buffer, complex_PRECISION, 2*n*l->next_level->vector_size ); for ( i=1; i<n; i++ ) - l->next_level->vbuf_PRECISION[i] = l->next_level->vbuf_PRECISION[0] + 2*i*l->next_level->vector_size; + l->next_level->vbuf_PRECISION[i].vector_buffer = l->next_level->vbuf_PRECISION[0].vector_buffer + 2*i*l->next_level->vector_size; #else - MALLOC( l->next_level->vbuf_PRECISION[0], complex_PRECISION, n*l->next_level->vector_size ); + MALLOC( l->next_level->vbuf_PRECISION[0].vector_buffer, complex_PRECISION, n*l->next_level->vector_size ); for ( i=1; i<n; i++ ) - l->next_level->vbuf_PRECISION[i] = l->next_level->vbuf_PRECISION[0] + i*l->next_level->vector_size; + l->next_level->vbuf_PRECISION[i].vector_buffer = l->next_level->vbuf_PRECISION[0].vector_buffer + i*l->next_level->vector_size; #endif } } @@ -193,19 +195,19 @@ void next_level_PRECISION_free( level_struct *l ) { fgmres_PRECISION_struct_free( &(l->next_level->p_PRECISION), l->next_level ); } else { #ifdef HAVE_TM1p1 - FREE( l->next_level->p_PRECISION.b, complex_PRECISION, 2*2*l->next_level->vector_size ); + FREE( l->next_level->p_PRECISION.b.vector_buffer, complex_PRECISION, 2*2*l->next_level->vector_size ); #else - FREE( l->next_level->p_PRECISION.b, complex_PRECISION, 2*l->next_level->vector_size ); + FREE( l->next_level->p_PRECISION.b.vector_buffer, complex_PRECISION, 2*l->next_level->vector_size ); #endif } int i, n = (l->next_level->level>0)?6:4; for ( i=1; i<n; i++ ) - l->next_level->vbuf_PRECISION[i] = NULL; + vector_PRECISION_init(&(l->next_level->vbuf_PRECISION[i])); #ifdef HAVE_TM1p1 - FREE( l->next_level->vbuf_PRECISION[0], complex_PRECISION, 2*n*l->next_level->vector_size ); + FREE( l->next_level->vbuf_PRECISION[0].vector_buffer, complex_PRECISION, 2*n*l->next_level->vector_size ); #else - FREE( l->next_level->vbuf_PRECISION[0], complex_PRECISION, n*l->next_level->vector_size ); + FREE( l->next_level->vbuf_PRECISION[0].vector_buffer, complex_PRECISION,
n*l->next_level->vector_size ); #endif coarsening_index_table_PRECISION_free( &(l->is_PRECISION), l ); } @@ -217,7 +219,7 @@ void next_level_PRECISION_free( level_struct *l ) { void level_PRECISION_init( level_struct *l ) { for ( int i=0; i<9; i++ ) - l->vbuf_PRECISION[i] = NULL; + vector_PRECISION_init(&(l->vbuf_PRECISION[i])); operator_PRECISION_init( &(l->op_PRECISION) ); operator_PRECISION_init( &(l->oe_op_PRECISION) ); @@ -231,20 +233,23 @@ void vcycle_timing_PRECISION( int n, level_struct *l, struct Thread *threading ) { ASSERT( g.mixed_precision ); - vector_PRECISION v1 = NULL, v2 = NULL; + vector_PRECISION v1, v2; + vector_PRECISION_init(&v1); + vector_PRECISION_init(&v2); + double t0=0, t1=0; - PUBLIC_MALLOC( v1, complex_PRECISION, l->inner_vector_size ); - PUBLIC_MALLOC( v2, complex_PRECISION, l->inner_vector_size ); + PUBLIC_MALLOC( v1.vector_buffer, complex_PRECISION, l->inner_vector_size ); + PUBLIC_MALLOC( v2.vector_buffer, complex_PRECISION, l->inner_vector_size ); START_LOCKED_MASTER(threading) - vector_PRECISION_define_random( v2, 0, l->inner_vector_size, l ); + vector_PRECISION_define_random( &v2, 0, l->inner_vector_size, l ); END_LOCKED_MASTER(threading) START_MASTER(threading) t0 = MPI_Wtime(); END_MASTER(threading) for ( int i=0; i<n; i++ ) - vcycle_PRECISION( v1, NULL, v2, NULL, _NO_RES, l, threading ); + vcycle_PRECISION( &v1, NULL, &v2, NULL, _NO_RES, l, threading ); START_MASTER(threading) t1 = MPI_Wtime(); END_MASTER(threading) START_LOCKED_MASTER(threading) - PUBLIC_FREE( v1, complex_PRECISION, l->inner_vector_size ); - PUBLIC_FREE( v2, complex_PRECISION, l->inner_vector_size ); + PUBLIC_FREE( v1.vector_buffer, complex_PRECISION, l->inner_vector_size ); + PUBLIC_FREE( v2.vector_buffer, complex_PRECISION, l->inner_vector_size ); END_LOCKED_MASTER(threading) } diff --git a/src/interpolation_generic.c index 8981bec..79b26fe 100644 --- a/src/interpolation_generic.c +++ b/src/interpolation_generic.c @@ -28,32 +28,32 @@ void interpolation_PRECISION_alloc( level_struct *l ) { int k, n = l->num_eig_vect; MALLOC( l->is_PRECISION.eigenvalues, complex_PRECISION, n ); - MALLOC( l->is_PRECISION.test_vector, complex_PRECISION*, n ); - MALLOC( l->is_PRECISION.interpolation, complex_PRECISION*, n ); - l->is_PRECISION.interpolation[0] = NULL; - MALLOC_HUGEPAGES( l->is_PRECISION.interpolation[0], complex_PRECISION, n*l->vector_size, 64 ); + MALLOC( l->is_PRECISION.test_vector, vector_PRECISION, n ); + MALLOC( l->is_PRECISION.interpolation, vector_PRECISION, n ); + vector_PRECISION_init(&(l->is_PRECISION.interpolation[0])); + MALLOC_HUGEPAGES( l->is_PRECISION.interpolation[0].vector_buffer, complex_PRECISION, n*l->vector_size, 64 ); for ( k=1; k<n; k++ ) - l->is_PRECISION.interpolation[k] = l->is_PRECISION.interpolation[0] + k*l->vector_size; + l->is_PRECISION.interpolation[k].vector_buffer = l->is_PRECISION.interpolation[0].vector_buffer + k*l->vector_size; MALLOC( l->is_PRECISION.operator, complex_PRECISION, n*l->inner_vector_size ); - l->is_PRECISION.test_vector[0] = NULL; - MALLOC_HUGEPAGES( l->is_PRECISION.test_vector[0], complex_PRECISION, n*l->inner_vector_size, 64 ); + vector_PRECISION_init(&(l->is_PRECISION.test_vector[0])); + MALLOC_HUGEPAGES( l->is_PRECISION.test_vector[0].vector_buffer, complex_PRECISION, n*l->inner_vector_size, 64 ); for ( k=1; k<n; k++ ) - l->is_PRECISION.test_vector[k] = l->is_PRECISION.test_vector[0] + k*l->inner_vector_size; + l->is_PRECISION.test_vector[k].vector_buffer = l->is_PRECISION.test_vector[0].vector_buffer + k*l->inner_vector_size; } }
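/* Note the allocation pattern above: only element 0 of test_vector and interpolation owns
   storage; entries k>0 are views at fixed offsets into the same hugepage block. A minimal
   sketch of the invariant this sets up (check_test_vector_views is hypothetical, not part of
   the patch):

     void check_test_vector_views( level_struct *l ) {
       int n = l->num_eig_vect;
       for ( int k=1; k<n; k++ )
         ASSERT( l->is_PRECISION.test_vector[k].vector_buffer ==
                 l->is_PRECISION.test_vector[0].vector_buffer + k*l->inner_vector_size );
     }

   Only index 0 may therefore be handed back to FREE_HUGEPAGES, exactly as
   interpolation_PRECISION_free below does. */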
void interpolation_PRECISION_dummy_alloc( level_struct *l ) { - MALLOC( l->is_PRECISION.test_vector, complex_PRECISION*, l->num_eig_vect ); - MALLOC( l->is_PRECISION.interpolation, complex_PRECISION*, l->num_eig_vect ); + MALLOC( l->is_PRECISION.test_vector, vector_PRECISION, l->num_eig_vect ); + MALLOC( l->is_PRECISION.interpolation, vector_PRECISION, l->num_eig_vect ); } void interpolation_PRECISION_dummy_free( level_struct *l ) { - FREE( l->is_PRECISION.test_vector, complex_PRECISION*, l->num_eig_vect ); - FREE( l->is_PRECISION.interpolation, complex_PRECISION*, l->num_eig_vect ); + FREE( l->is_PRECISION.test_vector, vector_PRECISION, l->num_eig_vect ); + FREE( l->is_PRECISION.interpolation, vector_PRECISION, l->num_eig_vect ); } @@ -61,11 +61,11 @@ void interpolation_PRECISION_free( level_struct *l ) { int n = l->num_eig_vect; - FREE_HUGEPAGES( l->is_PRECISION.test_vector[0], complex_PRECISION, n*l->inner_vector_size ); + FREE_HUGEPAGES( l->is_PRECISION.test_vector[0].vector_buffer, complex_PRECISION, n*l->inner_vector_size ); FREE( l->is_PRECISION.eigenvalues, complex_PRECISION, n ); - FREE( l->is_PRECISION.test_vector, complex_PRECISION*, n ); - FREE_HUGEPAGES( l->is_PRECISION.interpolation[0], complex_PRECISION, n*l->vector_size ); - FREE( l->is_PRECISION.interpolation, complex_PRECISION*, n ); + FREE( l->is_PRECISION.test_vector, vector_PRECISION, n ); + FREE_HUGEPAGES( l->is_PRECISION.interpolation[0].vector_buffer, complex_PRECISION, n*l->vector_size ); + FREE( l->is_PRECISION.interpolation, vector_PRECISION, n ); FREE( l->is_PRECISION.operator, complex_PRECISION, n*l->inner_vector_size ); } @@ -90,24 +90,24 @@ void define_interpolation_PRECISION_operator( complex_PRECISION **interpolation, } -void interpolate_PRECISION( vector_PRECISION phi, vector_PRECISION phi_c, level_struct *l, struct Thread *threading ) { +void interpolate_PRECISION( vector_PRECISION *phi, vector_PRECISION *phi_c, level_struct *l, struct Thread *threading ) { PROF_PRECISION_START( _PR, threading ); int i, j, k, k1, k2, num_aggregates = l->is_PRECISION.num_agg, num_eig_vect = l->num_eig_vect, sign = 1, num_parent_eig_vect = l->num_parent_eig_vect, aggregate_sites = l->num_inner_lattice_sites / num_aggregates; - complex_PRECISION *operator = l->is_PRECISION.operator, *phi_pt = phi, - *phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer; + complex_PRECISION *operator = l->is_PRECISION.operator, *phi_pt = phi->vector_buffer, + *phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer.vector_buffer; START_LOCKED_MASTER(threading) - vector_PRECISION_distribute( phi_c_pt, phi_c, l->next_level ); + vector_PRECISION_distribute( &(l->next_level->gs_PRECISION.transfer_buffer), phi_c, l->next_level ); END_LOCKED_MASTER(threading) SYNC_HYPERTHREADS(threading) #ifdef HAVE_TM1p1 if( g.n_flavours==2 ) for ( i=threading->n_thread*threading->core + threading->thread; i<num_aggregates; i+=threading->n_core*threading->n_thread ) { - phi_pt = phi + i*2*2*num_parent_eig_vect*aggregate_sites; - phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer + i*2*2*num_eig_vect; + phi_pt = phi->vector_buffer + i*2*2*num_parent_eig_vect*aggregate_sites; + phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer.vector_buffer + i*2*2*num_eig_vect; operator = l->is_PRECISION.operator + i*2*num_eig_vect*num_parent_eig_vect*aggregate_sites; for ( k=0; k<aggregate_sites; k++ ) { [...] } } else #endif for ( i=threading->n_thread*threading->core + threading->thread; i<num_aggregates; i+=threading->n_core*threading->n_thread ) { - phi_pt = phi + i*2*num_parent_eig_vect*aggregate_sites; - phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer + i*2*num_eig_vect; + phi_pt = phi->vector_buffer + i*2*num_parent_eig_vect*aggregate_sites; + phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer.vector_buffer + i*2*num_eig_vect; operator = l->is_PRECISION.operator +
i*2*num_eig_vect*num_parent_eig_vect*aggregate_sites; for ( k=0; kis_PRECISION.num_agg, num_eig_vect = l->num_eig_vect, num_parent_eig_vect = l->num_parent_eig_vect, aggregate_sites = l->num_inner_lattice_sites / num_aggregates; - complex_PRECISION *operator = l->is_PRECISION.operator, *phi_pt = phi, - *phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer; + complex_PRECISION *operator = l->is_PRECISION.operator, *phi_pt = phi->vector_buffer, + *phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer.vector_buffer; START_LOCKED_MASTER(threading) - vector_PRECISION_distribute( phi_c_pt, phi_c, l->next_level ); + vector_PRECISION_distribute( &(l->next_level->gs_PRECISION.transfer_buffer), phi_c, l->next_level ); END_LOCKED_MASTER(threading) SYNC_HYPERTHREADS(threading) #ifdef HAVE_TM1p1 if( g.n_flavours==2 ) for ( i=threading->n_thread*threading->core + threading->thread; in_core*threading->n_thread ) { - phi_pt = phi + i*2*2*num_parent_eig_vect*aggregate_sites; - phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer + i*2*2*num_eig_vect; + phi_pt = phi->vector_buffer + i*2*2*num_parent_eig_vect*aggregate_sites; + phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer.vector_buffer + i*2*2*num_eig_vect; int sign = 1; operator = l->is_PRECISION.operator + i*2*num_eig_vect*num_parent_eig_vect*aggregate_sites; for ( k=0; kn_thread*threading->core + threading->thread; in_core*threading->n_thread ) { - phi_pt = phi + i*2*num_parent_eig_vect*aggregate_sites; - phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer + i*2*num_eig_vect; + phi_pt = phi->vector_buffer + i*2*num_parent_eig_vect*aggregate_sites; + phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer.vector_buffer + i*2*num_eig_vect; int sign = 1; operator = l->is_PRECISION.operator + i*2*num_eig_vect*num_parent_eig_vect*aggregate_sites; for ( k=0; kis_PRECISION.num_agg, num_eig_vect = l->num_eig_vect, sign = 1, num_parent_eig_vect = l->num_parent_eig_vect, aggregate_sites = l->num_inner_lattice_sites / num_aggregates; - complex_PRECISION *operator = l->is_PRECISION.operator, *phi_pt = phi, - *phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer; + complex_PRECISION *operator = l->is_PRECISION.operator, *phi_pt = phi->vector_buffer, + *phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer.vector_buffer; #ifdef HAVE_TM1p1 if( g.n_flavours==2 ) for ( i=threading->n_thread*threading->core + threading->thread; in_core*threading->n_thread ) { - phi_pt = phi + i*2*2*num_parent_eig_vect*aggregate_sites; - phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer + i*2*2*num_eig_vect; + phi_pt = phi->vector_buffer + i*2*2*num_parent_eig_vect*aggregate_sites; + phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer.vector_buffer + i*2*2*num_eig_vect; operator = l->is_PRECISION.operator + i*2*num_eig_vect*num_parent_eig_vect*aggregate_sites; for ( j=0; j<2*2*num_eig_vect; j++ ) @@ -280,8 +280,8 @@ void restrict_PRECISION( vector_PRECISION phi_c, vector_PRECISION phi, level_str else #endif for ( i=threading->n_thread*threading->core + threading->thread; in_core*threading->n_thread ) { - phi_pt = phi + i*2*num_parent_eig_vect*aggregate_sites; - phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer + i*2*num_eig_vect; + phi_pt = phi->vector_buffer + i*2*num_parent_eig_vect*aggregate_sites; + phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer.vector_buffer + i*2*num_eig_vect; operator = l->is_PRECISION.operator + i*2*num_eig_vect*num_parent_eig_vect*aggregate_sites; for ( j=0; j<2*num_eig_vect; j++ ) @@ -303,7 +303,7 @@ void 
restrict_PRECISION( vector_PRECISION phi_c, vector_PRECISION phi, level_str SYNC_HYPERTHREADS(threading) START_LOCKED_MASTER(threading) - vector_PRECISION_gather( phi_c, l->next_level->gs_PRECISION.transfer_buffer, l->next_level ); + vector_PRECISION_gather( phi_c, &(l->next_level->gs_PRECISION.transfer_buffer), l->next_level ); END_LOCKED_MASTER(threading) PROF_PRECISION_STOP( _PR, 1, threading ); } diff --git a/src/interpolation_generic.h b/src/interpolation_generic.h index 97be6ec..79c146f 100644 --- a/src/interpolation_generic.h +++ b/src/interpolation_generic.h @@ -29,9 +29,9 @@ void interpolation_PRECISION_dummy_alloc( level_struct *l ); void interpolation_PRECISION_dummy_free( level_struct *l ); - void interpolate_PRECISION( vector_PRECISION phi, vector_PRECISION phi_c, level_struct *l, struct Thread *threading ); - void interpolate3_PRECISION( vector_PRECISION phi, vector_PRECISION phi_c, level_struct *l, struct Thread *threading ); - void restrict_PRECISION( vector_PRECISION phi_c, vector_PRECISION phi, level_struct *l, struct Thread *threading ); + void interpolate_PRECISION( vector_PRECISION *phi, vector_PRECISION *phi_c, level_struct *l, struct Thread *threading ); + void interpolate3_PRECISION( vector_PRECISION *phi, vector_PRECISION *phi_c, level_struct *l, struct Thread *threading ); + void restrict_PRECISION( vector_PRECISION *phi_c, vector_PRECISION *phi, level_struct *l, struct Thread *threading ); void define_interpolation_PRECISION_operator( complex_PRECISION **interpolation, level_struct *l, struct Thread *threading ); #endif diff --git a/src/io.c b/src/io.c index 02b5ceb..fe9272a 100644 --- a/src/io.c +++ b/src/io.c @@ -716,8 +716,9 @@ void vector_io( double *phi, char *filename, const int mode, level_struct *l ) { FREE( buffer[0].data, double, bar_size ); FREE( buffer[1].data, double, bar_size ); } - - norm = global_norm_double( (vector_double)phi, 0, l->inner_vector_size, l, no_threading ); + vector_double phi_vec; + phi_vec.vector_buffer = (buffer_double) phi; + norm = global_norm_double( &phi_vec, 0, l->inner_vector_size, l, no_threading ); printf0("norm: %e\n", norm ); printf0("...done (%lf seconds)\n\n", t1-t0 ); } @@ -871,7 +872,7 @@ void vector_io_single_file( double *psi, double *lambda, char *filename, const i ASSERT( fread( buffer_pt->data, sizeof(double), bar_size, file ) ); } - phi=(double *) (l->x); + phi=(double *) (&(l->x)); phi_pt=phi; for ( t=0; tis_float.test_vector[j], l->x, l->s_float.op.translation_table, l, no_threading); + trans_float(&(l->is_float.test_vector[j]), &(l->x), l->s_float.op.translation_table, l, no_threading); else - trans_double(l->is_double.test_vector[j], l->x, l->s_double.op.translation_table, l, no_threading); + trans_double(&(l->is_double.test_vector[j]), &(l->x), l->s_double.op.translation_table, l, no_threading); } else { - vector_double_copy( ((vector_double)psi)+j*l->inner_vector_size, l->x, 0, l->inner_vector_size, l ); + vector_double psi_vec; + psi_vec.vector_buffer = ((buffer_double) psi) + j*l->inner_vector_size; + vector_double_copy( &psi_vec, &(l->x), 0, l->inner_vector_size, l ); } } } else if ( mode == _WRITE ) { @@ -927,13 +930,15 @@ void vector_io_single_file( double *psi, double *lambda, char *filename, const i for ( j=0; jx, l->is_float.test_vector[j], l->s_float.op.translation_table, l, no_threading ); + trans_back_float( &(l->x), &(l->is_float.test_vector[j]), l->s_float.op.translation_table, l, no_threading ); else - trans_back_double( l->x, l->is_double.test_vector[j], l->s_double.op.translation_table, 
l, no_threading ); + trans_back_double( &(l->x), &(l->is_double.test_vector[j]), l->s_double.op.translation_table, l, no_threading ); } else { - vector_double_copy( l->x, ((complex_double*)psi)+j*l->inner_vector_size, 0, l->inner_vector_size, l ); + vector_double psi_vec; + psi_vec.vector_buffer = ((complex_double*)psi)+j*l->inner_vector_size; + vector_double_copy( &(l->x), &psi_vec, 0, l->inner_vector_size, l ); } - phi=(double *)(l->x); + phi=(double *)(&(l->x)); phi_pt=phi; for ( t=0; tnext->data, bar_size, MPI_DOUBLE, desired_rank, 0, g.comm_cart, &rreq ); } diff --git a/src/linalg.c b/src/linalg.c index 3487404..06d3961 100644 --- a/src/linalg.c +++ b/src/linalg.c @@ -23,7 +23,7 @@ #ifndef OPTIMIZED_LINALG_float void process_multi_inner_product_MP( int count, complex_double *results, vector_float *phi, - vector_float psi, int start, int end, level_struct *l, + vector_float *psi, int start, int end, level_struct *l, struct Thread *threading ) { PROF_float_START( _PIP, threading ); @@ -39,7 +39,7 @@ void process_multi_inner_product_MP( int count, complex_double *results, vector_ compute_core_start_end_custom(start, end, &thread_start, &thread_end, l, threading, 12); for(int c=0; cvector_buffer[i]; i++; ) } } @@ -62,7 +62,7 @@ void process_multi_inner_product_MP( int count, complex_double *results, vector_ } #endif -double global_norm_MP( vector_float x, int start, int end, level_struct *l, struct Thread *threading ) { +double global_norm_MP( vector_float *x, int start, int end, level_struct *l, struct Thread *threading ) { PROF_float_START( _GIP, threading ); @@ -75,7 +75,7 @@ double global_norm_MP( vector_float x, int start, int end, level_struct *l, stru SYNC_CORES(threading) for ( i=thread_start; ivector_buffer[i]); i++; ) // sum over cores START_NO_HYPERTHREADS(threading) diff --git a/src/linalg.h b/src/linalg.h index 4182def..1343ced 100644 --- a/src/linalg.h +++ b/src/linalg.h @@ -24,16 +24,16 @@ struct Thread; - void vector_double_multi_saxpy( vector_double z, vector_double *V, complex_double *alpha, + void vector_double_multi_saxpy( vector_double *z, vector_double *V, complex_double *alpha, int sign, int count, int start, int end, level_struct *l ); - void vector_float_multi_saxpy( vector_float z, vector_float *V, complex_float *alpha, + void vector_float_multi_saxpy( vector_float *z, vector_float *V, complex_float *alpha, int sign, int count, int start, int end, level_struct *l ); void process_multi_inner_product_MP( int count, complex_double *results, vector_float *phi, - vector_float psi, int start, int end, level_struct *l, + vector_float *psi, int start, int end, level_struct *l, struct Thread *threading ); - double global_norm_MP( vector_float x, int start, int end, level_struct *l, struct Thread *threading ); + double global_norm_MP( vector_float *x, int start, int end, level_struct *l, struct Thread *threading ); #endif diff --git a/src/linalg_generic.c b/src/linalg_generic.c index db223bd..6a0e2cd 100644 --- a/src/linalg_generic.c +++ b/src/linalg_generic.c @@ -26,7 +26,7 @@ #include "sse_linalg_PRECISION.h" #ifndef OPTIMIZED_LINALG_PRECISION -complex_PRECISION global_inner_product_PRECISION( vector_PRECISION phi, vector_PRECISION psi, int start, int end, level_struct *l, struct Thread *threading ) { +complex_PRECISION global_inner_product_PRECISION( vector_PRECISION *phi, vector_PRECISION *psi, int start, int end, level_struct *l, struct Thread *threading ) { PROF_PRECISION_START( _GIP, threading ); complex_PRECISION local_alpha = 0, global_alpha = 0; @@ -37,7 +37,7 @@ 
complex_PRECISION global_inner_product_PRECISION( vector_PRECISION phi, vector_P SYNC_CORES(threading) - VECTOR_FOR( int i=thread_start, i<thread_end, local_alpha += conj_PRECISION(phi[i])*psi[i], i++, l ); + VECTOR_FOR( int i=thread_start, i<thread_end, local_alpha += conj_PRECISION(phi->vector_buffer[i])*psi->vector_buffer[i], i++, l ); // sum over cores START_NO_HYPERTHREADS(threading) @@ -74,7 +74,7 @@ complex_PRECISION global_inner_product_PRECISION( vector_PRECISION phi, vector_P #endif -complex_PRECISION process_inner_product_PRECISION( vector_PRECISION phi, vector_PRECISION psi, int start, int end, level_struct *l, struct Thread *threading ) { +complex_PRECISION process_inner_product_PRECISION( vector_PRECISION *phi, vector_PRECISION *psi, int start, int end, level_struct *l, struct Thread *threading ) { PROF_PRECISION_START( _PIP, threading ); int i; @@ -82,7 +82,7 @@ complex_PRECISION process_inner_product_PRECISION( vector_PRECISION phi, vector_ SYNC_CORES(threading) - THREADED_VECTOR_FOR( i, start, end, local_alpha += conj_PRECISION(phi[i])*psi[i], i++, l, threading ); + THREADED_VECTOR_FOR( i, start, end, local_alpha += conj_PRECISION(phi->vector_buffer[i])*psi->vector_buffer[i], i++, l, threading ); START_NO_HYPERTHREADS(threading) ((complex_PRECISION *)threading->workspace)[threading->core] = local_alpha; @@ -104,7 +104,7 @@ #if !defined( OPTIMIZED_LINALG_PRECISION ) -void process_multi_inner_product_PRECISION( int count, complex_PRECISION *results, vector_PRECISION *phi, vector_PRECISION psi, +void process_multi_inner_product_PRECISION( int count, complex_PRECISION *results, vector_PRECISION *phi, vector_PRECISION *psi, int start, int end, level_struct *l, struct Thread *threading ) { PROF_PRECISION_START( _PIP, threading ); @@ -120,18 +120,18 @@ void process_multi_inner_product_PRECISION( int count, complex_PRECISION *result compute_core_start_end_custom(start, end, &thread_start, &thread_end, l, threading, 12); for(int c=0; c<count; c++) [...] conj_PRECISION(phi[c].vector_buffer[i])*psi->vector_buffer[i]; i++; ) } else { #ifdef _M10TV compute_core_start_end_custom(start, end, &thread_start, &thread_end, l, threading, 20); for(int c=0; c<count; c++) [...] conj_PRECISION(phi[c].vector_buffer[i])*psi->vector_buffer[i]; i++; ) #else compute_core_start_end_custom(start, end, &thread_start, &thread_end, l, threading, 2); for(int c=0; c<count; c++) [...] conj_PRECISION(phi[c].vector_buffer[i])*psi->vector_buffer[i]; i++; ) #endif } @@ -155,11 +155,11 @@ void process_multi_inner_product_PRECISION( int count, complex_PRECISION *result #endif -complex_PRECISION local_xy_over_xx_PRECISION( vector_PRECISION phi, vector_PRECISION psi, int start, int end, level_struct *l ) { +complex_PRECISION local_xy_over_xx_PRECISION( vector_PRECISION *phi, vector_PRECISION *psi, int start, int end, level_struct *l ) { complex_PRECISION numerator = 0.0; PRECISION denominator = 0.0; - VECTOR_FOR( int i=start, i<end, numerator += conj_PRECISION(phi[i])*psi[i]; denominator += NORM_SQUARE_PRECISION(phi[i]), i++, l ); + VECTOR_FOR( int i=start, i<end, numerator += conj_PRECISION(phi->vector_buffer[i])*psi->vector_buffer[i]; denominator += NORM_SQUARE_PRECISION(phi->vector_buffer[i]), i++, l ); if ( abs_PRECISION(denominator) < EPS_PRECISION ) { return 0.0; @@ -169,7 +169,7 @@ complex_PRECISION local_xy_over_xx_PRECISION( vector_PRECISION phi, vector_PRECI } #ifndef OPTIMIZED_LINALG_PRECISION -PRECISION global_norm_PRECISION( vector_PRECISION x, int start, int end, level_struct *l, struct Thread *threading ) { +PRECISION global_norm_PRECISION( vector_PRECISION *x, int start, int end, level_struct *l, struct Thread *threading ) { PROF_PRECISION_START( _GIP, threading ); @@ -181,7 +181,7 @@ PRECISION global_norm_PRECISION( vector_PRECISION x, int start, int end, level_s SYNC_CORES(threading) - VECTOR_FOR( int i=thread_start, i<thread_end, local_alpha += NORM_SQUARE_PRECISION(x[i]), i++, l ); + VECTOR_FOR( int i=thread_start, i<thread_end, local_alpha += NORM_SQUARE_PRECISION(x->vector_buffer[i]), i++, l ); // sum over cores START_NO_HYPERTHREADS(threading)
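Every routine in this file now receives vector_PRECISION * instead of a bare buffer, so call sites that only hold a raw array wrap it in a stack-local struct, exactly as the io.c hunk earlier in this patch does with phi_vec. A hedged sketch of that idiom; norm_of_raw_field and its body are illustrative, not part of the patch, while the called names come straight from the diff:

  /* Sketch: adapting a legacy double* field to the struct-based API. */
  static double norm_of_raw_field( double *phi, level_struct *l ) {
    vector_double phi_vec;
    vector_double_init( &phi_vec );                /* presumed helper, see above */
    phi_vec.vector_buffer = (buffer_double) phi;   /* adopt the pointer, no copy */
    return global_norm_double( &phi_vec, 0, l->inner_vector_size, l, no_threading );
  }

No allocation or copy takes place; the wrapper merely reinterprets the pointer, so it must not outlive phi.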
@@ -217,7 +217,7 @@ PRECISION global_norm_PRECISION( vector_PRECISION x, int start, int end, level_s } #endif -PRECISION process_norm_PRECISION( vector_PRECISION x, int start, int end, level_struct *l, struct Thread *threading ) { +PRECISION process_norm_PRECISION( vector_PRECISION *x, int start, int end, level_struct *l, struct Thread *threading ) { int i; PRECISION local_alpha = 0; @@ -225,7 +225,7 @@ PRECISION process_norm_PRECISION( vector_PRECISION x, int start, int end, level_ SYNC_CORES(threading) - THREADED_VECTOR_FOR( i, start, end, local_alpha += NORM_SQUARE_PRECISION(x[i]), i++, l, threading ); + THREADED_VECTOR_FOR( i, start, end, local_alpha += NORM_SQUARE_PRECISION(x->vector_buffer[i]), i++, l, threading ); START_NO_HYPERTHREADS(threading) ((PRECISION *)threading->workspace)[threading->core] = local_alpha; @@ -246,39 +246,39 @@ PRECISION process_norm_PRECISION( vector_PRECISION x, int start, int end, level_ } -void vector_PRECISION_plus( vector_PRECISION z, vector_PRECISION x, vector_PRECISION y, int start, int end, level_struct *l ) { +void vector_PRECISION_plus( vector_PRECISION *z, vector_PRECISION *x, vector_PRECISION *y, int start, int end, level_struct *l ) { int thread = omp_get_thread_num(); if(thread == 0 && start != end) PROF_PRECISION_START( _LA2 ); - VECTOR_FOR( int i=start, i<end, z[i] = x[i] + y[i], i++, l ); + VECTOR_FOR( int i=start, i<end, z->vector_buffer[i] = x->vector_buffer[i] + y->vector_buffer[i], i++, l ); if(thread == 0 && start != end) PROF_PRECISION_STOP( _LA2, (double)(end-start)/(double)l->inner_vector_size ); } -void vector_PRECISION_minus( vector_PRECISION z, vector_PRECISION x, vector_PRECISION y, int start, int end, level_struct *l ) { +void vector_PRECISION_minus( vector_PRECISION *z, vector_PRECISION *x, vector_PRECISION *y, int start, int end, level_struct *l ) { int thread = omp_get_thread_num(); if(thread == 0 && start != end) PROF_PRECISION_START( _LA2 ); - VECTOR_FOR( int i=start, i<end, z[i] = x[i] - y[i], i++, l ); + VECTOR_FOR( int i=start, i<end, z->vector_buffer[i] = x->vector_buffer[i] - y->vector_buffer[i], i++, l ); if(thread == 0 && start != end) PROF_PRECISION_STOP( _LA2, (double)(end-start)/(double)l->inner_vector_size ); } #ifndef OPTIMIZED_LINALG_PRECISION -void vector_PRECISION_scale( vector_PRECISION z, vector_PRECISION x, complex_PRECISION alpha, int start, int end, level_struct *l ) { +void vector_PRECISION_scale( vector_PRECISION *z, vector_PRECISION *x, complex_PRECISION alpha, int start, int end, level_struct *l ) { int thread = omp_get_thread_num(); if(thread == 0 && start != end) PROF_PRECISION_START( _LA6 ); - VECTOR_FOR( int i=start, i<end, z[i] = alpha*x[i], i++, l ); + VECTOR_FOR( int i=start, i<end, z->vector_buffer[i] = alpha*x->vector_buffer[i], i++, l ); if(thread == 0 && start != end) PROF_PRECISION_STOP( _LA6, (double)(end-start)/(double)l->inner_vector_size ); @@ -286,24 +286,24 @@ void vector_PRECISION_scale( vector_PRECISION z, vector_PRECISION x, complex_PRE #endif -void vector_PRECISION_real_scale( vector_PRECISION z, vector_PRECISION x, complex_PRECISION alpha, +void buffer_PRECISION_real_scale( complex_PRECISION *z, complex_PRECISION *x, complex_PRECISION alpha, int start, int end, level_struct *l ) { - + PRECISION *r_z = (PRECISION*)z, *r_x = (PRECISION*)x, r_alpha = creal_PRECISION(alpha); int r_start = 2*start, r_end = 2*end; - + int thread = omp_get_thread_num(); if(thread == 0 && start != end) PROF_PRECISION_START( _LA2 ); - + REAL_VECTOR_FOR( int i=r_start, i<r_end, r_z[i] = r_alpha*r_x[i], i++, l ); if(thread == 0 && start != end) PROF_PRECISION_STOP( _LA2, (double)(end-start)/(double)l->inner_vector_size ); } -void vector_PRECISION_copy( vector_PRECISION z, vector_PRECISION x, int start, int end, level_struct *l ) { +void buffer_PRECISION_copy( complex_PRECISION *z, complex_PRECISION *x, int start, int end, level_struct *l ) { int thread = omp_get_thread_num(); 
if(thread == 0 && start != end) @@ -316,13 +316,13 @@ void vector_PRECISION_copy( vector_PRECISION z, vector_PRECISION x, int start, i } #ifndef OPTIMIZED_LINALG_PRECISION -void vector_PRECISION_saxpy( vector_PRECISION z, vector_PRECISION x, vector_PRECISION y, complex_PRECISION alpha, int start, int end, level_struct *l ) { +void vector_PRECISION_saxpy( vector_PRECISION *z, vector_PRECISION *x, vector_PRECISION *y, complex_PRECISION alpha, int start, int end, level_struct *l ) { int thread = omp_get_thread_num(); if (thread == 0 && start != end ) PROF_PRECISION_START( _LA8 ); - VECTOR_FOR( int i=start, i<end, z[i] = x[i] + alpha*y[i], i++, l ); + VECTOR_FOR( int i=start, i<end, z->vector_buffer[i] = x->vector_buffer[i] + alpha*y->vector_buffer[i], i++, l ); if( thread == 0 && start != end ) PROF_PRECISION_STOP( _LA8, (double)(end-start)/(double)l->inner_vector_size ); @@ -330,7 +330,7 @@ void vector_PRECISION_saxpy( vector_PRECISION z, vector_PRECISION x, vector_PREC #endif #ifndef OPTIMIZED_LINALG_PRECISION -void vector_PRECISION_multi_saxpy( vector_PRECISION z, vector_PRECISION *V, complex_PRECISION *alpha, +void vector_PRECISION_multi_saxpy( vector_PRECISION *z, vector_PRECISION *V, complex_PRECISION *alpha, int sign, int count, int start, int end, level_struct *l ) { int thread = omp_get_thread_num(); @@ -344,7 +344,7 @@ void vector_PRECISION_multi_saxpy( vector_PRECISION z, vector_PRECISION *V, comp for ( int c=0; c<count; c++ ) { [...] z->vector_buffer[i] += V[c].vector_buffer[i]*alpha_signed[c]; i++; ) } } @@ -353,27 +353,29 @@ void vector_PRECISION_multi_saxpy( vector_PRECISION z, vector_PRECISION *V, comp } #endif -void vector_PRECISION_projection( vector_PRECISION z, vector_PRECISION v, int k, vector_PRECISION *W, complex_PRECISION *diag, +void vector_PRECISION_projection( vector_PRECISION *z, vector_PRECISION *v, int k, vector_PRECISION **W, complex_PRECISION *diag, int orthogonal, level_struct *l, Thread *threading ) { int j, start, end; compute_core_start_end( 0, l->inner_vector_size, &start, &end, l, threading ); - vector_PRECISION v_tmp = NULL, *W_tmp = NULL; + vector_PRECISION v_tmp, *W_tmp = NULL; complex_PRECISION ip[k], ip_buffer[2*k]; - MALLOC( v_tmp, complex_PRECISION, l->inner_vector_size ); - vector_PRECISION_define(v_tmp, 0, 0, l->inner_vector_size, l ); + vector_PRECISION_init(&v_tmp); + + MALLOC( v_tmp.vector_buffer, complex_PRECISION, l->inner_vector_size ); + vector_PRECISION_define(&v_tmp, 0, 0, l->inner_vector_size, l ); - MALLOC( W_tmp, complex_PRECISION*, k ); - W_tmp[0] = NULL; - MALLOC( W_tmp[0], complex_PRECISION, k*l->inner_vector_size ); + MALLOC( W_tmp, vector_PRECISION, k ); + vector_PRECISION_init(&W_tmp[0]); + MALLOC( W_tmp[0].vector_buffer, complex_PRECISION, k*l->inner_vector_size ); for ( j = 1; j<k; j++ ) - W_tmp[j] = W_tmp[0]+j*l->inner_vector_size; + W_tmp[j].vector_buffer = W_tmp[0].vector_buffer+j*l->inner_vector_size; for ( j=0; j<k; j++ ) { - vector_PRECISION_scale( W_tmp[j], W[j], diag[j], 0, l->inner_vector_size, l ); + vector_PRECISION_scale( &W_tmp[j], W[j], diag[j], 0, l->inner_vector_size, l ); }
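The renames above split the file into buffer-level kernels (buffer_PRECISION_real_scale, buffer_PRECISION_copy, operating on raw complex_PRECISION arrays) and struct-level routines taking vector_PRECISION *. Solver code elsewhere in this patch still calls vector_PRECISION_copy with struct handles, so a struct-level wrapper presumably lives in the new src/vector_generic.c; a sketch under that assumption:

  /* Sketch: presumed struct-level wrapper in src/vector_generic.c. */
  void vector_PRECISION_copy( vector_PRECISION *z, vector_PRECISION *x,
                              int start, int end, level_struct *l ) {
    /* unwrap both handles and defer to the raw-buffer kernel above */
    buffer_PRECISION_copy( z->vector_buffer, x->vector_buffer, start, end, l );
  }

Keeping the kernels buffer-based means code that still owns bare buffers (Hessenberg storage, transfer and ghost buffers) can reuse them without fabricating a struct.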
l->inner_vector_size, l ); + vector_PRECISION_copy( z, &v_tmp, 0, l->inner_vector_size, l ); - FREE( v_tmp, complex_PRECISION, l->inner_vector_size ); - FREE( W_tmp[0], complex_PRECISION, k*l->inner_vector_size ); - FREE( W_tmp, complex_PRECISION*, k ); + FREE( v_tmp.vector_buffer, complex_PRECISION, l->inner_vector_size ); + FREE( W_tmp[0].vector_buffer, complex_PRECISION, k*l->inner_vector_size ); + FREE( W_tmp, vector_PRECISION, k ); } void gram_schmidt_on_aggregates_PRECISION( vector_PRECISION *V, const int num_vect, level_struct *l, struct Thread *threading ) { @@ -411,23 +413,23 @@ void gram_schmidt_on_aggregates_PRECISION( vector_PRECISION *V, const int num_ve for ( j=threading->n_thread*threading->core+threading->thread; jn_thread*threading->n_core ) { for ( k1=0; k1 V[k2] | 2*j-th and 2*j+1-st aggregate for ( i=0; ivector_buffer[i] = alpha*phi->vector_buffer[i]; eta2->vector_buffer[i] = _COMPLEX_PRECISION_ZERO; i++; ) + FOR6( eta2->vector_buffer[i] = alpha*phi->vector_buffer[i]; eta1->vector_buffer[i] = _COMPLEX_PRECISION_ZERO; i++; ) } PROF_PRECISION_STOP( _LA6, 1 ); } -void set_boundary_PRECISION( vector_PRECISION phi, complex_PRECISION alpha, level_struct *l, struct Thread *threading ) { +void set_boundary_PRECISION( vector_PRECISION *phi, complex_PRECISION alpha, level_struct *l, struct Thread *threading ) { PROF_PRECISION_START( _SET, threading ); int i; SYNC_CORES(threading) - THREADED_VECTOR_FOR( i, l->inner_vector_size, l->vector_size, phi[i] = alpha, i++, l, threading ); + THREADED_VECTOR_FOR( i, l->inner_vector_size, l->vector_size, phi->vector_buffer[i] = alpha, i++, l, threading ); SYNC_CORES(threading) PROF_PRECISION_STOP( _SET, (double)(l->vector_size-l->inner_vector_size)/(double)l->inner_vector_size, threading ); @@ -496,7 +498,7 @@ void gram_schmidt_PRECISION( vector_PRECISION *V, complex_PRECISION *buffer, con for ( i=begin; iinner_vector_size, l, threading ); + process_multi_inner_product_PRECISION( i, tmp, V, &V[i], 0, l->inner_vector_size, l, threading ); SYNC_CORES(threading) START_MASTER(threading) for ( j=0; jinner_vector_size, l, threading ); + beta = global_norm_PRECISION( &V[i], 0, l->inner_vector_size, l, threading ); SYNC_MASTER_TO_ALL(threading) - vector_PRECISION_real_scale( V[i], V[i], creal(1.0/beta), start, end, l ); + vector_PRECISION_real_scale( &V[i], &V[i], creal(1.0/beta), start, end, l ); SYNC_CORES(threading) } @@ -543,6 +545,8 @@ void setup_gram_schmidt_PRECISION_compute_dots( int thread_end; int cache_block_size = 12*64; complex_PRECISION tmp[cache_block_size]; + vector_PRECISION tmp_vect; + tmp_vect.vector_buffer = tmp; for(int i=0; i<2*offset; i++) thread_buffer[i] = 0.0; @@ -551,11 +555,11 @@ void setup_gram_schmidt_PRECISION_compute_dots( compute_core_start_end_custom(start, end, &thread_start, &thread_end, l, threading, cache_block_size); for ( int i=thread_start; idepth > 0 ) { - coarse_gamma5_PRECISION( g5v, V[i], thread_start, thread_end, l ); + coarse_gamma5_PRECISION( g5v, &V[i], thread_start, thread_end, l ); for ( j=0; jdepth > 0 ) { for( j=0; jinner_vector_size, threading ); } diff --git a/src/linalg_generic.h b/src/linalg_generic.h index 9bd7a20..2ef5dc1 100644 --- a/src/linalg_generic.h +++ b/src/linalg_generic.h @@ -99,24 +99,24 @@ struct Thread; - complex_PRECISION global_inner_product_PRECISION( vector_PRECISION x, vector_PRECISION y, int start, int end, level_struct *l, struct Thread *threading ); - complex_PRECISION process_inner_product_PRECISION( vector_PRECISION phi, vector_PRECISION psi, int start, int end, 
level_struct *l, struct Thread *threading ); + complex_PRECISION global_inner_product_PRECISION( vector_PRECISION *x, vector_PRECISION *y, int start, int end, level_struct *l, struct Thread *threading ); + complex_PRECISION process_inner_product_PRECISION( vector_PRECISION *phi, vector_PRECISION *psi, int start, int end, level_struct *l, struct Thread *threading ); - void process_multi_inner_product_PRECISION( int count, complex_PRECISION *results, vector_PRECISION *phi, vector_PRECISION psi, + void process_multi_inner_product_PRECISION( int count, complex_PRECISION *results, vector_PRECISION *phi, vector_PRECISION *psi, int start, int end, level_struct *l, struct Thread *threading ); - PRECISION global_norm_PRECISION( vector_PRECISION phi, int start, int end, level_struct *l, struct Thread *threading ); - PRECISION process_norm_PRECISION( vector_PRECISION x, int start, int end, level_struct *l, struct Thread *threading ); + PRECISION global_norm_PRECISION( vector_PRECISION *phi, int start, int end, level_struct *l, struct Thread *threading ); + PRECISION process_norm_PRECISION( vector_PRECISION *x, int start, int end, level_struct *l, struct Thread *threading ); - complex_PRECISION local_xy_over_xx_PRECISION( vector_PRECISION phi, vector_PRECISION psi, int start, int end, level_struct *l ); - void vector_PRECISION_plus( vector_PRECISION z, vector_PRECISION x, vector_PRECISION y, int start, int end, level_struct *l ); // z := x + y - void vector_PRECISION_minus( vector_PRECISION z, vector_PRECISION x, vector_PRECISION y, int start, int end, level_struct *l ); // z := x - y - void vector_PRECISION_scale( vector_PRECISION z, vector_PRECISION x, complex_PRECISION alpha, int start, int end, level_struct *l ); // z := alpha*x - void vector_PRECISION_real_scale( vector_PRECISION z, vector_PRECISION x, complex_PRECISION alpha, + complex_PRECISION local_xy_over_xx_PRECISION( vector_PRECISION *phi, vector_PRECISION *psi, int start, int end, level_struct *l ); + void vector_PRECISION_plus( vector_PRECISION *z, vector_PRECISION *x, vector_PRECISION *y, int start, int end, level_struct *l ); // z := x + y + void vector_PRECISION_minus( vector_PRECISION *z, vector_PRECISION *x, vector_PRECISION *y, int start, int end, level_struct *l ); // z := x - y + void vector_PRECISION_scale( vector_PRECISION *z, vector_PRECISION *x, complex_PRECISION alpha, int start, int end, level_struct *l ); // z := alpha*x + void buffer_PRECISION_real_scale( complex_PRECISION *z, complex_PRECISION *x, complex_PRECISION alpha, int start, int end, level_struct *l ); - void vector_PRECISION_saxpy( vector_PRECISION z, vector_PRECISION x, vector_PRECISION y, complex_PRECISION alpha, int start, int end, level_struct *l ); // z := x + alpha*y - void vector_PRECISION_copy( vector_PRECISION z, vector_PRECISION x, int start, int end, level_struct *l ); // z := x - void vector_PRECISION_projection( vector_PRECISION z, vector_PRECISION v, int k, vector_PRECISION *W, complex_PRECISION *diag, + void vector_PRECISION_saxpy( vector_PRECISION *z, vector_PRECISION *x, vector_PRECISION *y, complex_PRECISION alpha, int start, int end, level_struct *l ); // z := x + alpha*y + void buffer_PRECISION_copy( complex_PRECISION *z, complex_PRECISION *x, int start, int end, level_struct *l ); // z := x + void vector_PRECISION_projection( vector_PRECISION *z, vector_PRECISION *v, int k, vector_PRECISION **W, complex_PRECISION *diag, int orthogonal, level_struct *l, Thread *threading ); void gram_schmidt_on_aggregates_PRECISION( vector_PRECISION *V, const 
int num_vec, level_struct *l, struct Thread *threading ); @@ -135,11 +135,11 @@ int num_vec, int leading_dimension, level_struct *l, struct Thread *threading ); void gram_schmidt_PRECISION( vector_PRECISION *V, complex_PRECISION *buffer, const int start, const int n, level_struct *l, struct Thread *threading ); - void setup_gram_schmidt_PRECISION( vector_PRECISION *V, vector_PRECISION g5v, + void setup_gram_schmidt_PRECISION( vector_PRECISION *V, vector_PRECISION *g5v, complex_PRECISION *buffer, const int n, level_struct *l, struct Thread *threading ); - void spinwise_PRECISION_skalarmultiply( vector_PRECISION eta1, vector_PRECISION eta2, - vector_PRECISION phi, complex_PRECISION alpha, int start, int end, level_struct *l ); - void set_boundary_PRECISION( vector_PRECISION phi, complex_PRECISION alpha, level_struct *l, struct Thread *threading ); + void spinwise_PRECISION_skalarmultiply( vector_PRECISION *eta1, vector_PRECISION *eta2, + vector_PRECISION *phi, complex_PRECISION alpha, int start, int end, level_struct *l ); + void set_boundary_PRECISION( vector_PRECISION *phi, complex_PRECISION alpha, level_struct *l, struct Thread *threading ); #endif
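The linsolve.c hunks that follow keep the existing suballocation scheme: one contiguous MALLOC backs w, the Arnoldi basis V[0..m] and, with a right preconditioner, Z; each vector_float merely points into that block through .vector_buffer. A condensed sketch of the pattern, with error handling omitted; the variable block is a local name for illustration only, since the diff itself reuses p->sp.w.vector_buffer as the block handle:

  /* Sketch: one allocation, many aliasing vector_float handles. */
  complex_float *block = NULL;
  MALLOC( block, complex_float, (2+m)*vl );        /* w + (m+1) basis slices */
  p->sp.w.vector_buffer = block;                   /* slice 0                */
  for ( int i = 0; i <= m; i++ )
    p->sp.V[i].vector_buffer = block + (i+1)*vl;   /* slices 1 .. m+1        */
  /* teardown returns only the head pointer: */
  FREE( p->sp.w.vector_buffer, complex_float, p->sp.total_storage );

Because V[i].vector_buffer only aliases the block, fgmres_MP_struct_free releases all storage through p->sp.w.vector_buffer and then frees just the array of handles.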
diff --git a/src/linsolve.c b/src/linsolve.c index bc24c81..b6ca32f 100644 --- a/src/linsolve.c +++ b/src/linsolve.c @@ -90,11 +90,11 @@ void fgmres_MP_struct_alloc( int m, int n, long int vl, double tol, const int pr // s p->dp.s = p->dp.H[0] + total; total += m+1; // x - p->dp.x = p->dp.H[0] + total; total += vl; + p->dp.x.vector_buffer = p->dp.H[0] + total; total += vl; // r - p->dp.r = p->dp.H[0] + total; total += vl; + p->dp.r.vector_buffer = p->dp.H[0] + total; total += vl; // b - p->dp.b = p->dp.H[0] + total; total += vl; + p->dp.b.vector_buffer = p->dp.H[0] + total; total += vl; ASSERT( p->dp.total_storage == total ); @@ -102,7 +102,7 @@ void fgmres_MP_struct_alloc( int m, int n, long int vl, double tol, const int pr // single precision part total = 0; total += (2+m)*vl; // w, V - MALLOC( p->sp.V, complex_float*, m+1 ); + MALLOC( p->sp.V, vector_float, m+1 ); if ( precond != NULL ) { if ( prec_kind == _RIGHT ) { total += (m+1)*vl; // Z @@ -111,26 +111,26 @@ void fgmres_MP_struct_alloc( int m, int n, long int vl, double tol, const int pr total += vl; k = 1; } - MALLOC( p->sp.Z, complex_float*, k ); + MALLOC( p->sp.Z, vector_float, k ); } p->sp.total_storage = total; // precomputed storage amount - p->sp.w = NULL; - MALLOC( p->sp.w, complex_float, total ); + vector_float_init(&(p->sp.w)); + MALLOC( p->sp.w.vector_buffer, complex_float, total ); // reserve storage total = 0; // w - p->sp.w = p->sp.w + total; total += vl; + p->sp.w.vector_buffer = p->sp.w.vector_buffer + total; total += vl; // V for ( i=0; i<m+1; i++ ) { - p->sp.V[i] = p->sp.w + total; total += vl; + p->sp.V[i].vector_buffer = p->sp.w.vector_buffer + total; total += vl; } // Z if ( precond != NULL ) { for ( i=0; i<k; i++ ) { - p->sp.Z[i] = p->sp.w + total; total += vl; + p->sp.Z[i].vector_buffer = p->sp.w.vector_buffer + total; total += vl; } } @@ -141,10 +141,10 @@ void fgmres_MP_struct_free( gmres_MP_struct *p ) { // single precision - FREE( p->sp.w, complex_float, p->sp.total_storage ); - FREE( p->sp.V, complex_float*, p->sp.restart_length+1 ); + FREE( p->sp.w.vector_buffer, complex_float, p->sp.total_storage ); + FREE( p->sp.V, vector_float, p->sp.restart_length+1 ); if ( p->sp.Z != NULL ) - FREE( p->sp.Z, complex_float*, p->sp.kind==_RIGHT?p->sp.restart_length+1:1 ); + FREE( p->sp.Z, vector_float, p->sp.kind==_RIGHT?p->sp.restart_length+1:1 ); // double precision FREE( p->dp.H[0], complex_double, p->dp.total_storage ); @@ -191,12 +191,12 @@ int fgmres_MP( gmres_MP_struct *p, level_struct *l, struct Thread *threading ) { for( ol=0; ol<p->dp.num_restart && finish==0; ol++ ) { if( ol == 0 && p->dp.initial_guess_zero ) { - vector_double_copy( p->dp.r, p->dp.b, start, end, l ); + vector_double_copy( &(p->dp.r), &(p->dp.b), start, end, l ); } else { - apply_operator_double( p->dp.r, p->dp.x, &(p->dp), l, threading ); // compute r <- D*x - vector_double_minus( p->dp.r, p->dp.b, p->dp.r, start, end, l ); // compute r <- b - r + apply_operator_double( &(p->dp.r), &(p->dp.x), &(p->dp), l, threading ); // compute r <- D*x + vector_double_minus( &(p->dp.r), &(p->dp.b), &(p->dp.r), start, end, l ); // compute r <- b - r } - gamma0 = (complex_double) global_norm_double( p->dp.r, p->dp.v_start, p->dp.v_end, l, threading ); // gamma_0 = norm(r) + gamma0 = (complex_double) global_norm_double( &(p->dp.r), p->dp.v_start, p->dp.v_end, l, threading ); // gamma_0 = norm(r) START_MASTER(threading) p->dp.gamma[0] = gamma0; END_MASTER(threading) @@ -204,7 +204,7 @@ int fgmres_MP( gmres_MP_struct *p, level_struct *l, struct Thread *threading ) { if( ol == 0) { if (l->depth == 0 && !p->dp.initial_guess_zero) { - norm_r0 = global_norm_double( p->dp.b, start, end, l, threading ); + norm_r0 = global_norm_double( &(p->dp.b), start, end, l, threading ); printf0("| initial guess relative residual: %le |\n", creal(gamma0)/norm_r0); } else { norm_r0 = creal(gamma0); @@ -222,13 +222,13 @@ int fgmres_MP( gmres_MP_struct *p, level_struct *l, struct Thread *threading ) { } #endif - trans_float( p->sp.V[0], p->dp.r, l->s_float.op.translation_table, l, threading ); - vector_float_real_scale( p->sp.V[0], p->sp.V[0], (float)(1/p->dp.gamma[0]), start, end, l ); // V[0] <- r / gamma_0 + trans_float( &(p->sp.V[0]), &(p->dp.r), l->s_float.op.translation_table, l, threading ); + vector_float_real_scale( &(p->sp.V[0]), &(p->sp.V[0]), (float)(1/p->dp.gamma[0]), start, end, l ); // V[0] <- r / gamma_0 // inner loop in single precision for( il=0; il<p->dp.restart_length && finish==0; il++) { j = il; iter++; - arnoldi_step_MP( p->sp.V, p->sp.Z, p->sp.w, p->dp.H, p->dp.y, j, p->sp.preconditioner, &(p->sp), l, threading ); + arnoldi_step_MP( p->sp.V, p->sp.Z, &(p->sp.w), p->dp.H, p->dp.y, j, p->sp.preconditioner, &(p->sp), l, threading ); if ( cabs( p->dp.H[j][j+1] ) > 1E-15 ) { qr_update_double( p->dp.H, p->dp.s, p->dp.c, p->dp.gamma, j, l, threading ); @@ -254,14 +254,14 @@ int fgmres_MP( gmres_MP_struct *p, level_struct *l, struct Thread *threading ) { } // end of a single restart - compute_solution_MP( p->sp.w, (p->sp.preconditioner&&p->sp.kind==_RIGHT)?p->sp.Z:p->sp.V, + compute_solution_MP( &(p->sp.w), (p->sp.preconditioner&&p->sp.kind==_RIGHT)?p->sp.Z:p->sp.V, p->dp.y, p->dp.gamma, p->dp.H, j, &(p->sp), l, threading ); - trans_back_float( p->dp.r, p->sp.w, l->s_float.op.translation_table, l, threading ); + trans_back_float( &(p->dp.r), &(p->sp.w), l->s_float.op.translation_table, l, threading ); if ( ol == 0 ) { - vector_double_copy( p->dp.x, p->dp.r, start, end, l ); + vector_double_copy( &(p->dp.x), &(p->dp.r), start, end, l ); } else { - vector_double_plus( p->dp.x, p->dp.x, p->dp.r, start, end, l ); + vector_double_plus( &(p->dp.x), &(p->dp.x), &(p->dp.r), start, end, l ); } } // end of fgmres @@ -271,9 +271,9 @@ int fgmres_MP( gmres_MP_struct *p, level_struct *l, struct Thread *threading ) { if ( p->dp.print ) { #ifdef 
FGMRES_RESTEST - apply_operator_double( p->dp.r, p->dp.x, &(p->dp), l, threading ); - vector_double_minus( p->dp.r, p->dp.b, p->dp.r, start, end, l ); - beta = global_norm_double( p->dp.r, p->dp.v_start, p->dp.v_end, l, threading ); + apply_operator_double( &(p->dp.r), &(p->dp.x), &(p->dp), l, threading ); + vector_double_minus( &(p->dp.r), &(p->dp.b), &(p->dp.r), start, end, l ); + beta = global_norm_double( &(p->dp.r), p->dp.v_start, p->dp.v_end, l, threading ); #else beta = gamma_jp1; #endif @@ -321,7 +321,7 @@ int fgmres_MP( gmres_MP_struct *p, level_struct *l, struct Thread *threading ) { } -void arnoldi_step_MP( vector_float *V, vector_float *Z, vector_float w, +void arnoldi_step_MP( vector_float *V, vector_float *Z, vector_float *w, complex_double **H, complex_double* buffer, int j, void (*prec)(), gmres_float_struct *p, level_struct *l, struct Thread *threading ) { @@ -337,19 +337,19 @@ void arnoldi_step_MP( vector_float *V, vector_float *Z, vector_float w, if ( prec != NULL ) { if ( p->kind == _LEFT ) { - apply_operator_float( Z[0], V[j], p, l, threading ); - prec( w, NULL, Z[0], _NO_RES, l, threading ); + apply_operator_float( &Z[0], &V[j], p, l, threading ); + prec( w, NULL, &Z[0], _NO_RES, l, threading ); } else { if ( g.mixed_precision == 2 && (g.method >= 1 && g.method <= 2 ) ) { - prec( Z[j], w, V[j], _NO_RES, l, threading ); + prec( &Z[j], w, &V[j], _NO_RES, l, threading ); // obtains w = D * Z[j] from Schwarz } else { - prec( Z[j], NULL, V[j], _NO_RES, l, threading ); - apply_operator_float( w, Z[j], p, l, threading ); // w = D*Z[j] + prec( &Z[j], NULL, &V[j], _NO_RES, l, threading ); + apply_operator_float( w, &Z[j], p, l, threading ); // w = D*Z[j] } } } else { - apply_operator_float( w, V[j], p, l, threading ); // w = D*V[j] + apply_operator_float( w, &V[j], p, l, threading ); // w = D*V[j] } complex_double tmp[j+1]; @@ -383,11 +383,11 @@ void arnoldi_step_MP( vector_float *V, vector_float *Z, vector_float w, // V_j+1 = w / H_j+1,j if ( cabs_double( H[j][j+1] ) > 1e-15 ) - vector_float_real_scale( V[j+1], w, (float)(1/H[j][j+1]), start, end, l ); + vector_float_real_scale( &V[j+1], w, (float)(1/H[j][j+1]), start, end, l ); } -void compute_solution_MP( vector_float x, vector_float *V, complex_double *y, +void compute_solution_MP( vector_float *x, vector_float *V, complex_double *y, complex_double *gamma, complex_double **H, int j, gmres_float_struct *p, level_struct *l, struct Thread *threading ) { @@ -418,12 +418,12 @@ void compute_solution_MP( vector_float x, vector_float *V, complex_double *y, SYNC_MASTER_TO_ALL(threading) // x = V*y - vector_float_scale( x, V[0], (complex_float) y[0], start, end, l ); + vector_float_scale( x, &V[0], (complex_float) y[0], start, end, l ); complex_float alpha[j]; for ( i=1; i<=j; i++ ) alpha[i-1] = (complex_float) y[i]; - vector_float_multi_saxpy( x, &(V[1]), alpha, 1, j, start, end, l ); + vector_float_multi_saxpy( x, &V[1], alpha, 1, j, start, end, l ); } diff --git a/src/linsolve.h b/src/linsolve.h index 8ea985b..38426d1 100644 --- a/src/linsolve.h +++ b/src/linsolve.h @@ -27,11 +27,11 @@ struct Thread; - void arnoldi_step_MP( vector_float *V, vector_float *Z, vector_float w, + void arnoldi_step_MP( vector_float *V, vector_float *Z, vector_float *w, complex_double **H, complex_double* buffer, int j, void (*prec)(), gmres_float_struct *p, level_struct *l, struct Thread *threading ); - void compute_solution_MP( vector_float x, vector_float *V, complex_double *y, + void compute_solution_MP( vector_float *x, vector_float *V, 
complex_double *y, complex_double *gamma, complex_double **H, int j, gmres_float_struct *p, level_struct *l, struct Thread *threading ); diff --git a/src/linsolve_generic.c b/src/linsolve_generic.c index c981963..c8eb06c 100644 --- a/src/linsolve_generic.c +++ b/src/linsolve_generic.c @@ -31,10 +31,10 @@ void fgmres_PRECISION_struct_init( gmres_PRECISION_struct *p ) { p->Z = NULL; p->V = NULL; p->H = NULL; - p->x = NULL; - p->b = NULL; - p->r = NULL; - p->w = NULL; + vector_PRECISION_init(&(p->x)); + vector_PRECISION_init(&(p->b)); + vector_PRECISION_init(&(p->r)); + vector_PRECISION_init(&(p->w)); p->y = NULL; p->gamma = NULL; p->c = NULL; @@ -80,7 +80,7 @@ void fgmres_PRECISION_struct_alloc( int m, int n, long int vl, PRECISION tol, co MALLOC( p->H, complex_PRECISION*, m ); total += (5+m)*vl; // x, r, b, w, V - MALLOC( p->V, complex_PRECISION*, m+1 ); + MALLOC( p->V, vector_PRECISION, m+1 ); if ( precond != NULL ) { if ( prec_kind == _RIGHT ) { @@ -90,13 +90,13 @@ void fgmres_PRECISION_struct_alloc( int m, int n, long int vl, PRECISION tol, co total += vl; k = 1; } - MALLOC( p->Z, complex_PRECISION*, k ); + MALLOC( p->Z, vector_PRECISION, k ); } else { #if defined(SINGLE_ALLREDUCE_ARNOLDI) && defined(PIPELINED_ARNOLDI) if ( l->level == 0 && l->depth > 0 ) { total += (m+2)*vl; k = m+2; - MALLOC( p->Z, complex_PRECISION*, k ); + MALLOC( p->Z, vector_PRECISION, k ); } #else k = 0; @@ -126,22 +126,22 @@ void fgmres_PRECISION_struct_alloc( int m, int n, long int vl, PRECISION tol, co // s p->s = p->H[0] + total; total += m+1; // w - p->w = p->H[0] + total; total += vl; + p->w.vector_buffer = p->H[0] + total; total += vl; // V for ( i=0; iV[i] = p->H[0] + total; total += vl; + p->V[i].vector_buffer = p->H[0] + total; total += vl; } // Z for ( i=0; iZ[i] = p->H[0] + total; total += vl; + p->Z[i].vector_buffer = p->H[0] + total; total += vl; } // x - p->x = p->H[0] + total; total += vl; + p->x.vector_buffer = p->H[0] + total; total += vl; // r - p->r = p->H[0] + total; total += vl; + p->r.vector_buffer = p->H[0] + total; total += vl; // b - p->b = p->H[0] + total; total += vl; + p->b.vector_buffer = p->H[0] + total; total += vl; ASSERT( p->total_storage == total ); } @@ -205,10 +205,10 @@ void fgmres_PRECISION_struct_free( gmres_PRECISION_struct *p, level_struct *l ) if(p->restart_length > 0) { FREE( p->H[0], complex_PRECISION, p->total_storage ); FREE( p->H, complex_PRECISION*, p->restart_length ); - FREE( p->V, complex_PRECISION*, p->restart_length+1 ); + FREE( p->V, vector_PRECISION, p->restart_length+1 ); if ( p->Z != NULL ) - FREE( p->Z, complex_PRECISION*, k ); + FREE( p->Z, vector_PRECISION, k ); } p->D = NULL; @@ -253,11 +253,11 @@ int fgmres_PRECISION( gmres_PRECISION_struct *p, level_struct *l, struct Thread if( ol == 0 && p->initial_guess_zero ) { res = _NO_RES; - vector_PRECISION_copy( p->r, p->b, start, end, l ); + vector_PRECISION_copy( &(p->r), &(p->b), start, end, l ); } else { res = _RES; if ( p->kind == _LEFT && p->preconditioner ) { - apply_operator_PRECISION( p->Z[0], p->x, p, l, threading ); + apply_operator_PRECISION( &(p->Z[0]), &(p->x), p, l, threading ); if ( g.method == 5 ) { START_LOCKED_MASTER(threading) g.bicgstab_tol = (!g.mixed_precision)?p->tol:MAX( 1E-3, (p->tol/(gamma_jp1/norm_r0))*5E-1 ); @@ -265,11 +265,11 @@ int fgmres_PRECISION( gmres_PRECISION_struct *p, level_struct *l, struct Thread } p->preconditioner( p->w, NULL, p->Z[0], _NO_RES, l, threading ); } else { - apply_operator_PRECISION( p->w, p->x, p, l, threading ); // compute w = D*x + 
apply_operator_PRECISION( &(p->w), &(p->x), p, l, threading ); // compute w = D*x } - vector_PRECISION_minus( p->r, p->b, p->w, start, end, l ); // compute r = b - w + vector_PRECISION_minus( &(p->r), &(p->b), &(p->w), start, end, l ); // compute r = b - w } - gamma0 = global_norm_PRECISION( p->r, p->v_start, p->v_end, l, threading ); // gamma_0 = norm(r) + gamma0 = global_norm_PRECISION( &(p->r), p->v_start, p->v_end, l, threading ); // gamma_0 = norm(r) START_MASTER(threading) p->gamma[0] = gamma0; END_MASTER(threading); @@ -277,14 +277,14 @@ int fgmres_PRECISION( gmres_PRECISION_struct *p, level_struct *l, struct Thread if ( ol == 0 ) { if (l->depth == 0 && !p->initial_guess_zero) { - norm_r0 = global_norm_PRECISION( p->b, p->v_start, p->v_end, l, threading ); + norm_r0 = global_norm_PRECISION( &(p->b), p->v_start, p->v_end, l, threading ); printf0("| initial guess relative residual: %le |\n", creal(gamma0)/norm_r0); } else { norm_r0 = creal(p->gamma[0]); } } - vector_PRECISION_real_scale( p->V[0], p->r, 1/p->gamma[0], start, end, l ); // v_0 = r / gamma_0 + vector_PRECISION_real_scale( &(p->V[0]), &(p->r), 1/p->gamma[0], start, end, l ); // v_0 = r / gamma_0 #if defined(SINGLE_ALLREDUCE_ARNOLDI) && defined(PIPELINED_ARNOLDI) if ( l->level == 0 && l->depth > 0 ) { arnoldi_step_PRECISION( p->V, p->Z, p->w, p->H, p->y, 0, p->preconditioner, p, l, threading ); @@ -313,7 +313,7 @@ int fgmres_PRECISION( gmres_PRECISION_struct *p, level_struct *l, struct Thread } } #else - if ( !arnoldi_step_PRECISION( p->V, p->Z, p->w, p->H, p->y, j, p->preconditioner, p, l, threading ) ) { + if ( !arnoldi_step_PRECISION( p->V, p->Z, &(p->w), p->H, p->y, j, p->preconditioner, p, l, threading ) ) { printf0("| -------------- iteration %d, restart due to H(%d,%d) < 0 |\n", iter, j+1, j ); break; } @@ -343,7 +343,7 @@ int fgmres_PRECISION( gmres_PRECISION_struct *p, level_struct *l, struct Thread break; } } // end of a single restart - compute_solution_PRECISION( p->x, (p->preconditioner&&p->kind==_RIGHT)?p->Z:p->V, + compute_solution_PRECISION( &(p->x), (p->preconditioner&&p->kind==_RIGHT)?(p->Z):(p->V), p->y, p->gamma, p->H, j, (res==_NO_RES)?ol:1, p, l, threading ); } // end of fgmres @@ -353,9 +353,9 @@ int fgmres_PRECISION( gmres_PRECISION_struct *p, level_struct *l, struct Thread if ( p->print ) { #ifdef FGMRES_RESTEST - apply_operator_PRECISION( p->w, p->x, p, l, threading ); - vector_PRECISION_minus( p->r, p->b, p->w, start, end, l ); - beta = global_norm_PRECISION( p->r, p->v_start, p->v_end, l, threading ); + apply_operator_PRECISION( &(p->w), &(p->x), p, l, threading ); + vector_PRECISION_minus( &(p->r), &(p->b), &(p->w), start, end, l ); + beta = global_norm_PRECISION( &(p->r), p->v_start, p->v_end, l, threading ); #else beta = gamma_jp1; #endif @@ -439,13 +439,13 @@ void bicgstab_PRECISION( gmres_PRECISION_struct *ps, level_struct *l, struct Thr maxiter = 1000000; r = ps->r; b = ps->b; x = ps->x; p = ps->w; pp = ps->V[0]; r_tilde = ps->V[1]; v = ps->V[2]; s = ps->V[3]; t = ps->V[4]; - vector_PRECISION_copy( r, b, start, end, l ); - vector_PRECISION_copy( r_tilde, b, start, end, l ); - vector_PRECISION_define( x, 0, start, end, l ); - vector_PRECISION_define( v, 0, start, end, l ); - vector_PRECISION_define( s, 0, start, end, l ); - vector_PRECISION_define( t, 0, start, end, l ); - b_norm = global_norm_PRECISION( b, ps->v_start, ps->v_end, l, threading ); + vector_PRECISION_copy( &r, &b, start, end, l ); + vector_PRECISION_copy( &r_tilde, &b, start, end, l ); + vector_PRECISION_define( &x, 0, start, 
end, l ); + vector_PRECISION_define( &v, 0, start, end, l ); + vector_PRECISION_define( &s, 0, start, end, l ); + vector_PRECISION_define( &t, 0, start, end, l ); + b_norm = global_norm_PRECISION( &b, ps->v_start, ps->v_end, l, threading ); r_norm = b_norm; #if defined(TRACK_RES) && !defined(WILSON_BENCHMARK) @@ -457,7 +457,7 @@ void bicgstab_PRECISION( gmres_PRECISION_struct *ps, level_struct *l, struct Thr iter++; rho_old = rho; - rho = global_inner_product_PRECISION( r_tilde, r, ps->v_start, ps->v_end, l, threading ); + rho = global_inner_product_PRECISION( &r_tilde, &r, ps->v_start, ps->v_end, l, threading ); if ( rho == 0 ) { START_MASTER(threading) @@ -467,31 +467,31 @@ void bicgstab_PRECISION( gmres_PRECISION_struct *ps, level_struct *l, struct Thr } if ( iter == 1 ) { - vector_PRECISION_copy( p, r, start, end, l ); + vector_PRECISION_copy( &p, &r, start, end, l ); } else { beta = (rho/rho_old)*(alpha/omega); - vector_PRECISION_saxpy( pp, p, v, -omega, start, end, l ); - vector_PRECISION_saxpy( p, r, pp, beta, start, end, l ); + vector_PRECISION_saxpy( &pp, &p, &v, -omega, start, end, l ); + vector_PRECISION_saxpy( &p, &r, &pp , beta, start, end, l ); } - apply_operator_PRECISION( v, p, ps, l, threading ); - alpha = rho / global_inner_product_PRECISION( r_tilde, v, ps->v_start, ps->v_end, l, threading ); - vector_PRECISION_saxpy( s, r, v, -alpha, start, end, l ); - s_norm = global_norm_PRECISION( s, ps->v_start, ps->v_end, l, threading ); + apply_operator_PRECISION( &v, &p, ps, l, threading ); + alpha = rho / global_inner_product_PRECISION( &r_tilde, &v, ps->v_start, ps->v_end, l, threading ); + vector_PRECISION_saxpy( &s, &r, &v, -alpha, start, end, l ); + s_norm = global_norm_PRECISION( &s, ps->v_start, ps->v_end, l, threading ); if ( s_norm/b_norm < tol ) { - vector_PRECISION_saxpy( x, x, p, alpha, start, end, l ); + vector_PRECISION_saxpy( &x, &x, &p, alpha, start, end, l ); break; } - apply_operator_PRECISION( t, s, ps, l, threading ); - omega = global_inner_product_PRECISION( t, s, ps->v_start, ps->v_end, l, threading ) - / global_inner_product_PRECISION( t, t, ps->v_start, ps->v_end, l, threading ); + apply_operator_PRECISION( &t, &s, ps, l, threading ); + omega = global_inner_product_PRECISION( &t, &s, ps->v_start, ps->v_end, l, threading ) + / global_inner_product_PRECISION( &t, &t, ps->v_start, ps->v_end, l, threading ); - vector_PRECISION_saxpy( x, x, p, alpha, start, end, l ); - vector_PRECISION_saxpy( x, x, s, omega, start, end, l ); - vector_PRECISION_saxpy( r, s, t, -omega, start, end, l ); + vector_PRECISION_saxpy( &x, &x, &p, alpha, start, end, l ); + vector_PRECISION_saxpy( &x, &x, &s, omega, start, end, l ); + vector_PRECISION_saxpy( &r, &s, &t, -omega, start, end, l ); - r_norm = global_norm_PRECISION( r, ps->v_start, ps->v_end, l, threading ); + r_norm = global_norm_PRECISION( &r, ps->v_start, ps->v_end, l, threading ); #if defined(TRACK_RES) && !defined(WILSON_BENCHMARK) START_MASTER(threading) @@ -537,16 +537,15 @@ void cgn_PRECISION( gmres_PRECISION_struct *ps, level_struct *l, struct Thread * // this puts zero for all other hyperthreads, so we can call functions below with all hyperthreads compute_core_start_end(ps->v_start, ps->v_end, &start, &end, l, threading); - vector_PRECISION_define( x, 0, start, end, l ); - apply_operator_PRECISION( Dp, x, ps, l, threading ); - vector_PRECISION_minus( pp, b, Dp, start, end, l ); - apply_operator_dagger_PRECISION( r_old, pp, ps, l, threading ); + vector_PRECISION_define( &x, 0, start, end, l ); + 
apply_operator_PRECISION( &Dp, &x, ps, l, threading ); + vector_PRECISION_minus( &pp, &b, &Dp, start, end, l ); + apply_operator_dagger_PRECISION( &r_old, &pp, ps, l, threading ); - vector_PRECISION_copy( p, r_old, start, end, l ); - r0_norm = global_norm_PRECISION( r_old, ps->v_start, ps->v_end, l, threading ); + vector_PRECISION_copy( &p, &r_old, start, end, l ); + r0_norm = global_norm_PRECISION( &r_old, ps->v_start, ps->v_end, l, threading ); // prod_rr_old = global_inner_product_PRECISION( r_old, r_old, ps->v_start, ps->v_end, l, threading ); prod_rr_old = r0_norm*r0_norm; - #if defined(TRACK_RES) && !defined(WILSON_BENCHMARK) if ( ps->print ) { START_MASTER(threading) @@ -557,19 +556,19 @@ void cgn_PRECISION( gmres_PRECISION_struct *ps, level_struct *l, struct Thread * while ( sqrt(prod_rr_old) / r0_norm > tol && iter < maxiter ) { iter++; - apply_operator_PRECISION( pp, p, ps, l, threading ); - apply_operator_dagger_PRECISION( Dp, pp, ps, l, threading ); + apply_operator_PRECISION( &pp, &p, ps, l, threading ); + apply_operator_dagger_PRECISION( &Dp, &pp, ps, l, threading ); - gamma = global_inner_product_PRECISION( p, Dp, ps->v_start, ps->v_end, l, threading ); + gamma = global_inner_product_PRECISION( &p, &Dp, ps->v_start, ps->v_end, l, threading ); alpha = prod_rr_old / gamma; - vector_PRECISION_saxpy( x, x, p, alpha, start, end, l ); - vector_PRECISION_saxpy( r_new, r_old, Dp, -alpha, start, end, l ); + vector_PRECISION_saxpy( &x, &x, &p, alpha, start, end, l ); + vector_PRECISION_saxpy( &r_new, &r_old, &Dp, -alpha, start, end, l ); - gamma = global_inner_product_PRECISION( r_new, r_new, ps->v_start, ps->v_end, l, threading ); + gamma = global_inner_product_PRECISION( &r_new, &r_new, ps->v_start, ps->v_end, l, threading ); beta = gamma / prod_rr_old; - vector_PRECISION_saxpy( p, r_new, p, beta, start, end, l ); - vector_PRECISION_copy( r_old, r_new, start, end, l ); + vector_PRECISION_saxpy( &p, &r_new, &p, beta, start, end, l ); + vector_PRECISION_copy( &r_old, &r_new, start, end, l ); prod_rr_old = gamma; #if defined(TRACK_RES) && !defined(WILSON_BENCHMARK) if ( iter%100 == 0 && ps->print >=1 ) { @@ -580,10 +579,10 @@ void cgn_PRECISION( gmres_PRECISION_struct *ps, level_struct *l, struct Thread * #endif } - r0_norm = global_norm_PRECISION( b, ps->v_start, ps->v_end, l, threading ); - apply_operator_PRECISION( Dp, x, ps, l, threading ); - vector_PRECISION_minus( r_true, b, Dp, start, end, l ); - r_norm = global_norm_PRECISION( r_true, ps->v_start, ps->v_end, l, threading ); + r0_norm = global_norm_PRECISION( &b, ps->v_start, ps->v_end, l, threading ); + apply_operator_PRECISION( &Dp, &x, ps, l, threading ); + vector_PRECISION_minus( &r_true, &b, &Dp, start, end, l ); + r_norm = global_norm_PRECISION( &r_true, ps->v_start, ps->v_end, l, threading ); #if defined(TRACK_RES) && !defined(WILSON_BENCHMARK) if ( ps->print ) { @@ -598,22 +597,22 @@ void cgn_PRECISION( gmres_PRECISION_struct *ps, level_struct *l, struct Thread * while ( r_norm / r0_norm > tol && iter < maxiter ) { iter++; - apply_operator_PRECISION( pp, p, ps, l, threading ); - apply_operator_dagger_PRECISION( Dp, pp, ps, l, threading ); + apply_operator_PRECISION( &pp, &p, ps, l, threading ); + apply_operator_dagger_PRECISION( &Dp, &pp, ps, l, threading ); - gamma = global_inner_product_PRECISION( p, Dp, ps->v_start, ps->v_end, l, threading ); + gamma = global_inner_product_PRECISION( &p, &Dp, ps->v_start, ps->v_end, l, threading ); alpha = prod_rr_old / gamma; - vector_PRECISION_saxpy( x, x, p, alpha, start, end, l ); 
- vector_PRECISION_saxpy( r_new, r_old, Dp, -alpha, start, end, l ); + vector_PRECISION_saxpy( &x, &x, &p, alpha, start, end, l ); + vector_PRECISION_saxpy( &r_new, &r_old, &Dp, -alpha, start, end, l ); // residual update - vector_PRECISION_saxpy( r_true, r_true, pp, -alpha, start, end, l ); - r_norm = global_norm_PRECISION( r_true, ps->v_start, ps->v_end, l, threading ); - gamma = global_inner_product_PRECISION( r_new, r_new, ps->v_start, ps->v_end, l, threading ); + vector_PRECISION_saxpy( &r_true, &r_true, &pp, -alpha, start, end, l ); + r_norm = global_norm_PRECISION( &r_true, ps->v_start, ps->v_end, l, threading ); + gamma = global_inner_product_PRECISION( &r_new, &r_new, ps->v_start, ps->v_end, l, threading ); beta = gamma / prod_rr_old; - vector_PRECISION_saxpy( p, r_new, p, beta, start, end, l ); - vector_PRECISION_copy( r_old, r_new, start, end, l ); + vector_PRECISION_saxpy( &p, &r_new, &p, beta, start, end, l ); + vector_PRECISION_copy( &r_old, &r_new, start, end, l ); prod_rr_old = gamma; #if defined(TRACK_RES) && !defined(WILSON_BENCHMARK) if ( iter%100 == 0 && ps->print >=1 ) { @@ -630,10 +629,10 @@ void cgn_PRECISION( gmres_PRECISION_struct *ps, level_struct *l, struct Thread * printf0("+----------------------------------------------------------+\n"); printf0("| CGN iterations: %-6d |\n", iter ); END_MASTER(threading) - apply_operator_PRECISION( Dp, x, ps, l, threading ); - vector_PRECISION_minus( pp, b, Dp, start, end, l ); + apply_operator_PRECISION( &Dp, &x, ps, l, threading ); + vector_PRECISION_minus( &pp, &b, &Dp, start, end, l ); - beta = global_norm_PRECISION( pp, ps->v_start, ps->v_end, l, threading ); + beta = global_norm_PRECISION( &pp, ps->v_start, ps->v_end, l, threading ); START_MASTER(threading) if ( ps->timing ) printf0("| exact relative residual: ||r||/||b|| = %e |\n", creal(beta/r0_norm) ); printf0("| elapsed wall clock time: %-12g seconds |\n", t1-t0 ); @@ -658,15 +657,15 @@ void cgn_PRECISION( gmres_PRECISION_struct *ps, level_struct *l, struct Thread * } -int arnoldi_step_PRECISION( vector_PRECISION *V, vector_PRECISION *Z, vector_PRECISION w, +int arnoldi_step_PRECISION( vector_PRECISION *V, vector_PRECISION *Z, vector_PRECISION *w, complex_PRECISION **H, complex_PRECISION* buffer, int j, void (*prec)(), gmres_PRECISION_struct *p, level_struct *l, struct Thread *threading ) { /********************************************************************************* * Extends the Arnoldi basis by one vector. -* - vector_PRECISION *V: Contains the Arnoldi basis vectors. -* - vector_PRECISION *Z: If a right precond. P is used, contains P*V[j] for all j. -* - vector_PRECISION w: Will be appended to existing Arnoldi basis at +* - vector_PRECISION **V: Contains the Arnoldi basis vectors. +* - vector_PRECISION **Z: If a right precond. P is used, contains P*V[j] for all j. +* - vector_PRECISION *w: Will be appended to existing Arnoldi basis at * position j+1. * - complex_PRECISION **H: Contains full Hessenberg matrix from the Arnoldi * decomposition (columnmajor!) 
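One consequence of V changing from an array of pointers to an array of structs runs through the Arnoldi hunks below: a single basis vector is now passed by address as &V[j], while the bare name V still decays to the vector_PRECISION * that the multi-reduction routines expect. A small sketch of the two pointer shapes, condensed from the calls in arnoldi_step_PRECISION (the function wrapper itself is illustrative only):

  /* Sketch: V is the whole basis, &V[j] one member of it. */
  static void basis_shapes_example( vector_PRECISION *V, int j,
                                    gmres_PRECISION_struct *p, level_struct *l,
                                    struct Thread *threading ) {
    complex_PRECISION tmp[j+1];
    /* array of vectors and a single vector, as in the diff below */
    process_multi_inner_product_PRECISION( j+1, tmp, V, &V[j],
                                           p->v_start, p->v_end, l, threading );
  }

Under the old typedef both V and V[j] were complex_PRECISION pointers; the struct removes exactly that ambiguity, which is why the hunks below touch almost every V[...] argument.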
@@ -688,12 +687,12 @@ int arnoldi_step_PRECISION( vector_PRECISION *V, vector_PRECISION *Z, vector_PRE compute_core_start_end(p->v_start, p->v_end, &start, &end, l, threading); if ( j == 0 ) - vector_PRECISION_copy( Z[0], V[0], start, end, l ); + vector_PRECISION_copy( &Z[0], &V[0], start, end, l ); else - vector_PRECISION_copy( V[j], Z[j], start, end, l ); + vector_PRECISION_copy( &V[j], &Z[j], start, end, l ); complex_PRECISION tmp[j+1]; - process_multi_inner_product_PRECISION( j+1, tmp, V, V[j], p->v_start, p->v_end, l, threading ); + process_multi_inner_product_PRECISION( j+1, tmp, V, &V[j], p->v_start, p->v_end, l, threading ); START_MASTER(threading) PROF_PRECISION_START( _ALLR ); for( i=0; i<=j; i++ ) @@ -708,7 +707,7 @@ int arnoldi_step_PRECISION( vector_PRECISION *V, vector_PRECISION *Z, vector_PRE PROF_PRECISION_STOP( _ALLR, 1 ); END_MASTER(threading) - apply_operator_PRECISION( Z[j+1], Z[j], p, l, threading ); + apply_operator_PRECISION( &Z[j+1], &Z[j], p, l, threading ); START_MASTER(threading) PROF_PRECISION_START( _ALLR ); @@ -725,8 +724,8 @@ int arnoldi_step_PRECISION( vector_PRECISION *V, vector_PRECISION *Z, vector_PRE SYNC_MASTER_TO_ALL(threading) for( i=0; i 0 ) { @@ -736,13 +735,13 @@ int arnoldi_step_PRECISION( vector_PRECISION *V, vector_PRECISION *Z, vector_PRE SYNC_MASTER_TO_ALL(threading) if ( j == 0 ) { - if ( sigma ) vector_PRECISION_saxpy( Z[j+1], Z[j+1], Z[j], -sigma, start, end, l ); + if ( sigma ) vector_PRECISION_saxpy( &Z[j+1], &Z[j+1], &Z[j], -sigma, start, end, l ); } else { for( i=0; ikind == _LEFT ) { - apply_operator_PRECISION( Z[0], V[j], p, l, threading ); - prec( V[j+1], NULL, Z[0], _NO_RES, l, threading ); - if ( sigma ) vector_PRECISION_saxpy( V[j+1], V[j+1], V[j], -sigma, start, end, l ); + apply_operator_PRECISION( &Z[0], &V[j], p, l, threading ); + prec( &V[j+1], NULL, &Z[0], _NO_RES, l, threading ); + if ( sigma ) vector_PRECISION_saxpy( &V[j+1], &V[j+1], &V[j], -sigma, start, end, l ); } else { if ( l->level == 0 ) { - prec( Z[j], NULL, V[j], _NO_RES, l, threading ); - apply_operator_PRECISION( V[j+1], Z[j], p, l, threading ); + prec( &Z[j], NULL, &V[j], _NO_RES, l, threading ); + apply_operator_PRECISION( &V[j+1], &Z[j], p, l, threading ); } else { if ( g.mixed_precision == 2 && (g.method >= 1 && g.method <= 2 ) ) { - prec( Z[j], V[j+1], V[j], _NO_RES, l, threading ); + prec( &Z[j], &V[j+1], &V[j], _NO_RES, l, threading ); // obtains w = D * Z[j] from Schwarz } else { - prec( Z[j], NULL, V[j], _NO_RES, l, threading ); - apply_operator_PRECISION( V[j+1], Z[j], p, l, threading ); // w = D*Z[j] + prec( &Z[j], NULL, &V[j], _NO_RES, l, threading ); + apply_operator_PRECISION( &V[j+1], &Z[j], p, l, threading ); // w = D*Z[j] } } - if ( sigma ) vector_PRECISION_saxpy( V[j+1], V[j+1], V[j], -sigma, start, end, l ); + if ( sigma ) vector_PRECISION_saxpy( &V[j+1], &V[j+1], &V[j], -sigma, start, end, l ); } } else { - apply_operator_PRECISION( V[j+1], V[j], p, l, threading ); // w = D*V[j] - if ( sigma ) vector_PRECISION_saxpy( V[j+1], V[j+1], V[j], -sigma, start, end, l ); + apply_operator_PRECISION( &V[j+1], &V[j], p, l, threading ); // w = D*V[j] + if ( sigma ) vector_PRECISION_saxpy( &V[j+1], &V[j+1], &V[j], -sigma, start, end, l ); } complex_PRECISION tmp[j+2]; - process_multi_inner_product_PRECISION( j+2, tmp, V, V[j+1], p->v_start, p->v_end, l, threading ); + process_multi_inner_product_PRECISION( j+2, tmp, V, &V[j+1], p->v_start, p->v_end, l, threading ); START_MASTER(threading) for( i=0; i<=j+1; i++ ) buffer[i] = tmp[i]; @@ -804,8 +803,8 @@ 
int arnoldi_step_PRECISION( vector_PRECISION *V, vector_PRECISION *Z, vector_PRE SYNC_MASTER_TO_ALL(threading) for( i=0; i<=j; i++ ) - vector_PRECISION_saxpy( V[j+1], V[j+1], V[i], -H[j][i], start, end, l ); - vector_PRECISION_real_scale( V[j+1], V[j+1], 1/H[j][j+1], start, end, l ); + vector_PRECISION_saxpy( &V[j+1], &V[j+1], &V[i], -H[j][i], start, end, l ); + vector_PRECISION_real_scale( &V[j+1], &V[j+1], 1/H[j][j+1], start, end, l ); START_LOCKED_MASTER(threading) H[j][j] += sigma; END_LOCKED_MASTER(threading) @@ -824,23 +823,23 @@ int arnoldi_step_PRECISION( vector_PRECISION *V, vector_PRECISION *Z, vector_PRE if ( prec != NULL ) { if ( p->kind == _LEFT ) { - apply_operator_PRECISION( Z[0], V[j], p, l, threading ); - prec( w, NULL, Z[0], _NO_RES, l, threading ); + apply_operator_PRECISION( &Z[0], &V[j], p, l, threading ); + prec( w, NULL, &Z[0], _NO_RES, l, threading ); } else { if ( l->level == 0 ) { - apply_operator_PRECISION( w, Z[j], p, l, threading ); + apply_operator_PRECISION( w, &Z[j], p, l, threading ); } else { if ( g.mixed_precision == 2 && (g.method >= 1 && g.method <= 2 ) ) { - prec( Z[j], w, V[j], _NO_RES, l, threading ); + prec( &Z[j], w, &V[j], _NO_RES, l, threading ); // obtains w = D * Z[j] from Schwarz } else { - prec( Z[j], NULL, V[j], _NO_RES, l, threading ); - apply_operator_PRECISION( w, Z[j], p, l, threading ); // w = D*Z[j] + prec( &Z[j], NULL, &V[j], _NO_RES, l, threading ); + apply_operator_PRECISION( w, &Z[j], p, l, threading ); // w = D*Z[j] } } } } else { - apply_operator_PRECISION( w, V[j], p, l, threading ); // w = D*V[j] + apply_operator_PRECISION( w, &V[j], p, l, threading ); // w = D*V[j] } // orthogonalization @@ -860,7 +859,7 @@ int arnoldi_step_PRECISION( vector_PRECISION *V, vector_PRECISION *Z, vector_PRE END_MASTER(threading) SYNC_MASTER_TO_ALL(threading) for( i=0; i<=j; i++ ) - vector_PRECISION_saxpy( w, w, V[i], -H[j][i], start, end, l ); + vector_PRECISION_saxpy( w, w, &V[i], -H[j][i], start, end, l ); #ifdef REORTH // re-orthogonalization process_multi_inner_product_PRECISION( j+1, tmp, V, w, p->v_start, p->v_end, l, threading ); @@ -879,7 +878,7 @@ int arnoldi_step_PRECISION( vector_PRECISION *V, vector_PRECISION *Z, vector_PRE END_MASTER(threading) SYNC_MASTER_TO_ALL(threading) for( i=0; i<=j; i++ ) - vector_PRECISION_saxpy( w, w, V[i], -tmp[i], start, end, l ); + vector_PRECISION_saxpy( w, w, &V[i], -tmp[i], start, end, l ); #endif // normalization @@ -891,7 +890,7 @@ int arnoldi_step_PRECISION( vector_PRECISION *V, vector_PRECISION *Z, vector_PRE // V_j+1 = w / H_j+1,j if ( cabs_PRECISION( H[j][j+1] ) > 1e-15 ) - vector_PRECISION_real_scale( V[j+1], w, 1/H[j][j+1], start, end, l ); + vector_PRECISION_real_scale( &V[j+1], w, 1/H[j][j+1], start, end, l ); #endif return 1; } @@ -942,7 +941,7 @@ void qr_update_PRECISION( complex_PRECISION **H, complex_PRECISION *s, } -void compute_solution_PRECISION( vector_PRECISION x, vector_PRECISION *V, complex_PRECISION *y, +void compute_solution_PRECISION( vector_PRECISION *x, vector_PRECISION *V, complex_PRECISION *y, complex_PRECISION *gamma, complex_PRECISION **H, int j, int ol, gmres_PRECISION_struct *p, level_struct *l, struct Thread *threading ) { @@ -975,18 +974,18 @@ void compute_solution_PRECISION( vector_PRECISION x, vector_PRECISION *V, comple // x = x + V*y if ( ol ) { for ( i=0; i<=j; i++ ) { - vector_PRECISION_saxpy( x, x, V[i], y[i], start, end, l ); + vector_PRECISION_saxpy( x, x, &V[i], y[i], start, end, l ); } } else { - vector_PRECISION_scale( x, V[0], y[0], start, end, l ); + 
vector_PRECISION_scale( x, &V[0], y[0], start, end, l ); for ( i=1; i<=j; i++ ) { - vector_PRECISION_saxpy( x, x, V[i], y[i], start, end, l ); + vector_PRECISION_saxpy( x, x, &V[i], y[i], start, end, l ); } } } -void local_minres_PRECISION( vector_PRECISION phi, vector_PRECISION eta, vector_PRECISION latest_iter, +void local_minres_PRECISION( vector_PRECISION *phi, vector_PRECISION *eta, vector_PRECISION *latest_iter, int start, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ) { /********************************************************************************* @@ -1004,31 +1003,32 @@ void local_minres_PRECISION( vector_PRECISION phi, vector_PRECISION eta, vector_ int i, nv = l->num_lattice_site_var, n = l->block_iter, end = (g.odd_even&&l->depth==0)?(start+nv*s->num_block_even_sites):(start+s->block_vector_size); - vector_PRECISION Dr = s->local_minres_buffer[0]; - vector_PRECISION r = s->local_minres_buffer[1]; - vector_PRECISION lphi = s->local_minres_buffer[2]; + vector_PRECISION Dr, r, lphi; + Dr.vector_buffer = s->local_minres_buffer[0]; + r.vector_buffer = s->local_minres_buffer[1]; + lphi.vector_buffer = s->local_minres_buffer[2]; complex_PRECISION alpha; void (*block_op)() = (l->depth==0)?(g.odd_even?apply_block_schur_complement_PRECISION:block_d_plus_clover_PRECISION) :coarse_block_operator_PRECISION; - - vector_PRECISION_copy( r, eta, start, end, l ); - vector_PRECISION_define( lphi, 0, start, end, l ); + + vector_PRECISION_copy( &r, eta, start, end, l ); + vector_PRECISION_define( &lphi, 0, start, end, l ); for ( i=0; i/ - alpha = local_xy_over_xx_PRECISION( Dr, r, start, end, l ); + alpha = local_xy_over_xx_PRECISION( &Dr, &r, start, end, l ); // phi += alpha * r - vector_PRECISION_saxpy( lphi, lphi, r, alpha, start, end, l ); + vector_PRECISION_saxpy( &lphi, &lphi, &r, alpha, start, end, l ); // r -= alpha * Dr - vector_PRECISION_saxpy( r, r, Dr, -alpha, start, end, l ); + vector_PRECISION_saxpy( &r, &r, &Dr, -alpha, start, end, l ); } - if ( latest_iter != NULL ) vector_PRECISION_copy( latest_iter, lphi, start, end, l ); - if ( phi != NULL ) vector_PRECISION_plus( phi, phi, lphi, start, end, l ); - vector_PRECISION_copy( eta, r, start, end, l ); - + if ( latest_iter != NULL ) vector_PRECISION_copy( latest_iter, &lphi, start, end, l ); + if ( phi != NULL ) vector_PRECISION_plus( phi, phi, &lphi, start, end, l ); + vector_PRECISION_copy( eta, &r, start, end, l ); + END_UNTHREADED_FUNCTION(threading) } @@ -1051,15 +1051,15 @@ void fgcr_PRECISION( gmres_PRECISION_struct *p, level_struct *l ) { for( ol=0; olnum_restart && finish==0; ol++ ) { if( ol == 0 && p->initial_guess_zero ) { - vector_PRECISION_copy( p->r, p->b, p->v_start, p->v_end, l ); + vector_PRECISION_copy( &(p->r), &(p->b), p->v_start, p->v_end, l ); } else { - apply_operator_PRECISION( p->w, p->x, p, l, no_threading ); // compute w = D*x - vector_PRECISION_minus( p->r, p->b, p->w, p->v_start, p->v_end, l ); // compute r = b - w + apply_operator_PRECISION( &(p->w), &(p->x), p, l, no_threading ); // compute w = D*x + vector_PRECISION_minus( &(p->r), &(p->b), &(p->w), p->v_start, p->v_end, l ); // compute r = b - w } if( ol == 0) { - r0_norm = global_norm_PRECISION( p->r, p->v_start, p->v_end, l, no_threading ); + r0_norm = global_norm_PRECISION( &(p->r), p->v_start, p->v_end, l, no_threading ); } for( il=0; ilrestart_length && finish==0; il++ ) { @@ -1067,20 +1067,20 @@ void fgcr_PRECISION( gmres_PRECISION_struct *p, level_struct *l ) { j = il; iter++; p->preconditioner( p->V[j], p->r, _NO_RES, 
l, no_threading ); - apply_operator_PRECISION( p->Z[j], p->V[j], p, l, no_threading ); + apply_operator_PRECISION( &(p->Z[j]), &(p->V[j]), p, l, no_threading ); for( i=0; iZ[i], p->Z[j], p->v_start, p->v_end, l, no_threading ) / p->gamma[i]; - vector_PRECISION_saxpy( p->V[j], p->V[j], p->V[i], -beta, p->v_start, p->v_end, l ); - vector_PRECISION_saxpy( p->Z[j], p->Z[j], p->Z[i], -beta, p->v_start, p->v_end, l ); + beta = global_inner_product_PRECISION( &(p->Z[i]), &(p->Z[j]), p->v_start, p->v_end, l, no_threading ) / p->gamma[i]; + vector_PRECISION_saxpy( &(p->V[j]), &(p->V[j]), &(p->V[i]), -beta, p->v_start, p->v_end, l ); + vector_PRECISION_saxpy( &(p->Z[j]), &(p->Z[j]), &(p->Z[i]), -beta, p->v_start, p->v_end, l ); } - p->gamma[j] = global_inner_product_PRECISION( p->Z[j], p->Z[j], p->v_start, p->v_end, l, no_threading ); - alpha = global_inner_product_PRECISION( p->Z[j], p->r, p->v_start, p->v_end, l, no_threading ) / p->gamma[j]; - vector_PRECISION_saxpy( p->x, p->x, p->V[j], alpha, p->v_start, p->v_end, l ); - vector_PRECISION_saxpy( p->r, p->r, p->Z[j], -alpha, p->v_start, p->v_end, l ); + p->gamma[j] = global_inner_product_PRECISION( &(p->Z[j]), &(p->Z[j]), p->v_start, p->v_end, l, no_threading ); + alpha = global_inner_product_PRECISION( &(p->Z[j]), &(p->r), p->v_start, p->v_end, l, no_threading ) / p->gamma[j]; + vector_PRECISION_saxpy( &(p->x), &(p->x), &(p->V[j]), alpha, p->v_start, p->v_end, l ); + vector_PRECISION_saxpy( &(p->r), &(p->r), &(p->Z[j]), -alpha, p->v_start, p->v_end, l ); - alpha = global_norm_PRECISION( p->r, p->v_start, p->v_end, l, no_threading ) / r0_norm; + alpha = global_norm_PRECISION( &(p->r), p->v_start, p->v_end, l, no_threading ) / r0_norm; if ( creal(alpha) < p->tol ) { finish = 1; break; @@ -1094,9 +1094,9 @@ void fgcr_PRECISION( gmres_PRECISION_struct *p, level_struct *l ) { if ( p->timing || p->print ) t1 = MPI_Wtime(); if ( p->print ) { - apply_operator_PRECISION( p->w, p->x, p, l, no_threading ); - vector_PRECISION_minus( p->r, p->b, p->w, p->v_start, p->v_end, l ); - beta = global_norm_PRECISION( p->r, p->v_start, p->v_end, l, no_threading ); + apply_operator_PRECISION( &(p->w), &(p->x), p, l, no_threading ); + vector_PRECISION_minus( &(p->r), &(p->b), &(p->w), p->v_start, p->v_end, l ); + beta = global_norm_PRECISION( &(p->r), p->v_start, p->v_end, l, no_threading ); #if defined(TRACK_RES) && !defined(WILSON_BENCHMARK) printf0("+----------------------------------------------------------+\n"); printf0("\n"); diff --git a/src/linsolve_generic.h b/src/linsolve_generic.h index 8a1f2e8..1acde04 100644 --- a/src/linsolve_generic.h +++ b/src/linsolve_generic.h @@ -33,15 +33,15 @@ void fgcr_PRECISION( gmres_PRECISION_struct *p, level_struct *l ); void cgn_PRECISION( gmres_PRECISION_struct *p, level_struct *l, struct Thread *threading ); void bicgstab_PRECISION( gmres_PRECISION_struct *ps, level_struct *l, struct Thread *threading ); - void local_minres_PRECISION( vector_PRECISION phi, vector_PRECISION eta, vector_PRECISION latest_iter, + void local_minres_PRECISION( vector_PRECISION *phi, vector_PRECISION *eta, vector_PRECISION *latest_iter, int start, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ); - int arnoldi_step_PRECISION( vector_PRECISION *V, vector_PRECISION *Z, vector_PRECISION w, + int arnoldi_step_PRECISION( vector_PRECISION *V, vector_PRECISION *Z, vector_PRECISION *w, complex_PRECISION **H, complex_PRECISION* buffer, int j, void (*prec)(), gmres_PRECISION_struct *p, level_struct *l, struct Thread *threading ); 
void qr_update_PRECISION( complex_PRECISION **H, complex_PRECISION *s, complex_PRECISION *c, complex_PRECISION *gamma, int j, level_struct *l, struct Thread *threading ); - void compute_solution_PRECISION( vector_PRECISION x, vector_PRECISION *V, complex_PRECISION *y, complex_PRECISION *gamma, + void compute_solution_PRECISION( vector_PRECISION *x, vector_PRECISION *V, complex_PRECISION *y, complex_PRECISION *gamma, complex_PRECISION **H, int j, int ol, gmres_PRECISION_struct *p, level_struct *l, struct Thread *threading ); #endif diff --git a/src/main.h b/src/main.h index cf15fde..afdd27b 100644 --- a/src/main.h +++ b/src/main.h @@ -543,6 +543,8 @@ #include "var_table.h" #include "main_post_def_float.h" #include "main_post_def_double.h" +#include "vector_float.h" +#include "vector_double.h" #ifdef HAVE_LIME #include #include diff --git a/src/main_post_def_generic.h b/src/main_post_def_generic.h index 690ef6b..4817c43 100644 --- a/src/main_post_def_generic.h +++ b/src/main_post_def_generic.h @@ -26,35 +26,35 @@ #include "dirac_PRECISION.h" #include "coarse_operator_PRECISION.h" - static inline void apply_operator_PRECISION( vector_PRECISION output, vector_PRECISION input, gmres_PRECISION_struct *p, level_struct *l, struct Thread *threading ) { + static inline void apply_operator_PRECISION( vector_PRECISION *output, vector_PRECISION *input, gmres_PRECISION_struct *p, level_struct *l, struct Thread *threading ) { p->eval_operator( output, input, p->op, l, threading ); } - static inline void apply_operator_dagger_PRECISION( vector_PRECISION output, vector_PRECISION input, gmres_PRECISION_struct *p, level_struct *l, struct Thread *threading ) { + static inline void apply_operator_dagger_PRECISION( vector_PRECISION *output, vector_PRECISION *input, gmres_PRECISION_struct *p, level_struct *l, struct Thread *threading ) { #ifdef HAVE_TM1p1 if( g.n_flavours == 2 ) { - tau1_gamma5_PRECISION( l->vbuf_PRECISION[6], input, l, threading ); + tau1_gamma5_PRECISION( &(l->vbuf_PRECISION[6]), input, l, threading ); } else #endif { - gamma5_PRECISION( l->vbuf_PRECISION[6], input, l, threading ); + gamma5_PRECISION( &(l->vbuf_PRECISION[6]), input, l, threading ); #ifdef HAVE_TM //TODO: change_mu_sign_PRECISION( p->op, l, threading ); #endif } - apply_operator_PRECISION( l->vbuf_PRECISION[7], l->vbuf_PRECISION[6], p, l, threading ); + apply_operator_PRECISION( &(l->vbuf_PRECISION[7]), &(l->vbuf_PRECISION[6]), p, l, threading ); #ifdef HAVE_TM1p1 if( g.n_flavours == 2 ) { - tau1_gamma5_PRECISION( output, l->vbuf_PRECISION[7], l, threading ); + tau1_gamma5_PRECISION( output,&(l->vbuf_PRECISION[7]), l, threading ); } else #endif { - gamma5_PRECISION( output, l->vbuf_PRECISION[7], l, threading ); + gamma5_PRECISION( output, &(l->vbuf_PRECISION[7]), l, threading ); #ifdef HAVE_TM //TODO: change_mu_sign_PRECISION( p->op, l, threading ); #endif diff --git a/src/main_pre_def_generic.h b/src/main_pre_def_generic.h index d485518..d33e64e 100644 --- a/src/main_pre_def_generic.h +++ b/src/main_pre_def_generic.h @@ -24,7 +24,11 @@ typedef PRECISION complex complex_PRECISION; typedef PRECISION complex *config_PRECISION; - typedef PRECISION complex *vector_PRECISION; + typedef PRECISION complex *buffer_PRECISION; + + typedef struct { + buffer_PRECISION vector_buffer; + } vector_PRECISION; typedef struct { int length[8], *boundary_table[8], max_length[4], @@ -52,7 +56,8 @@ *index_table, *neighbor_table, *translation_table, table_dim[4], *backward_neighbor_table, table_mod_dim[4], *config_boundary_table[4]; - vector_PRECISION 
*buffer, prnT, prnZ, prnY, prnX, prpT, prpZ, prpY, prpX; + vector_PRECISION *buffer; + buffer_PRECISION prnT, prnZ, prnY, prnX, prpT, prpZ, prpY, prpX; comm_PRECISION_struct c; OPERATOR_TYPE_PRECISION *D_vectorized; OPERATOR_TYPE_PRECISION *D_transformed_vectorized; @@ -87,7 +92,7 @@ operator_PRECISION_struct op; vector_PRECISION buf1, buf2, buf3, buf4, buf5; vector_PRECISION oe_buf[4]; - vector_PRECISION local_minres_buffer[3]; + buffer_PRECISION local_minres_buffer[3]; int block_oe_offset, *index[4], dir_length[4], num_blocks, num_colors, dir_length_even[4], dir_length_odd[4], *oe_index[4], num_block_even_sites, num_block_odd_sites, num_aggregates, diff --git a/src/oddeven_generic.c b/src/oddeven_generic.c index 9da8cce..f49c552 100644 --- a/src/oddeven_generic.c +++ b/src/oddeven_generic.c @@ -191,203 +191,245 @@ void selfcoupling_LU_doublet_decomposition_PRECISION( const config_PRECISION out #endif -static inline void LLH_perform_fwd_bwd_subs_PRECISION( vector_PRECISION x, vector_PRECISION b, config_PRECISION L ) { +static inline void LLH_perform_fwd_bwd_subs_PRECISION( vector_PRECISION *x, vector_PRECISION *b, config_PRECISION L, + int start, int end ) { /********************************************************************************* * Solves L*(L^H)*x = b for x, i.e., the clover coupling for a single lattice * site. -* - vector_PRECISION b: Right hand side. -* - vector_PRECISION x: Solution. +* - vector_PRECISION *b: Right hand side. +* - vector_PRECISION *x: Solution. * - config_PRECISION L: Cholesky factor ( lower triangular matrix ) *********************************************************************************/ - register int i, j; + register int id, i, j; int n; + buffer_PRECISION x_pt = x->vector_buffer, b_pt = b->vector_buffer; + x_pt += start; b_pt += start; - for ( n=0; n<2; n++ ) { - // forward substitution with L - for ( i=0; i<6; i++ ) { - x[i] = b[i]; - for ( j=0; j=0; i-- ) { - for ( j=i+1; j<6; j++ ) { - x[i] = x[i] - conj_PRECISION(L[(j*(j+1))/2 + i]) * x[j]; + L -= 21; + // backward substitution with L^H + for ( i=5; i>=0; i-- ) { + for ( j=i+1; j<6; j++ ) { + x_pt[i] = x_pt[i] - conj_PRECISION(L[(j*(j+1))/2 + i]) * x_pt[j]; + } + x_pt[i] = x_pt[i] / conj_PRECISION(L[(i*(i+1))/2 + i]); } - x[i] = x[i] / conj_PRECISION(L[(i*(i+1))/2 + i]); + x_pt+=6; + b_pt+=6; + L+=21; } - x+=6; - b+=6; - L+=21; + x_pt+=12; b_pt+=12; L+=42; } } -static inline void LU_perform_fwd_bwd_subs_PRECISION( vector_PRECISION x, vector_PRECISION b, config_PRECISION LU ) { + +static inline void LU_perform_fwd_bwd_subs_PRECISION( vector_PRECISION *x, vector_PRECISION *b, config_PRECISION LU, + int start, int end ) { /********************************************************************************* * Solves L*U*x = b for x, i.e., the clover coupling for a single lattice * site. -* - vector_PRECISION b: Right hand side. -* - vector_PRECISION x: Solution. +* - vector_PRECISION *b: Right hand side. +* - vector_PRECISION *x: Solution. 
* - config_PRECISION L: Lower matrix from modified LU decomposition * Note: U is given by u_{ii}=1, u_{ij}=l_{ji}* / l_{ii} *********************************************************************************/ - register int i, j, n; + register int id, i, j, n; + buffer_PRECISION x_pt = x->vector_buffer, b_pt = b->vector_buffer; + x_pt += start; b_pt += start; #ifdef HAVE_TM1p1 - if( g.n_flavours == 2) - for ( n=0; n<2; n++ ) { - // solve x = U^(-1) L^(-1) b - // forward substitution with L - for ( i=0; i<12; i++ ) { - x[i] = b[i]; - for ( j=0; j=0; i-- ) { - for ( j=i+1; j<12; j++ ) { - x[i] = x[i] - LU[i*12+j]*x[j]; + if( g.n_flavours == 2) { + LU += (start/24)*288; + for ( id=start; id=0; i-- ) { + for ( j=i+1; j<12; j++ ) { + x_pt[i] = x_pt[i] - LU[i*12+j]*x_pt[j]; + } + x_pt[i] = x_pt[i]/LU[i*(12+1)]; } + x_pt+=12; + b_pt+=12; + LU+=12*12; } - // backward substitution with U - for ( i=6-1; i>=0; i-- ) { - for ( j=i+1; j<6; j++ ) { - x[i] = x[i] - LU[i*6+j]*x[j]; + x_pt+=24; b_pt+=24; LU+=288; + } + } else +#endif + { + LU += (start/12)*72; + for ( id=start; id=0; i-- ) { + for ( j=i+1; j<6; j++ ) { + x_pt[i] = x_pt[i] - LU[i*6+j]*x_pt[j]; + } + x_pt[i] = x_pt[i]/LU[i*(6+1)]; + } + x_pt+=6; + b_pt+=6; + LU+=6*6; } - x[i] = x[i]/LU[i*(6+1)]; } - x+=6; - b+=6; - LU+=6*6; + x_pt+=12; b_pt+=12; LU+=72; } } - -static inline void LLH_multiply_PRECISION( vector_PRECISION y, vector_PRECISION x, config_PRECISION L ) { +static inline void LLH_multiply_PRECISION( vector_PRECISION *y, vector_PRECISION *x, config_PRECISION L, + int start, int end ) { /********************************************************************************* * Applies the clover coupling term to a vector, by multiplying L^H * and then L. -* - vector_PRECISION x: Input vector. -* - vector_PRECISION y: Output vector. +* - vector_PRECISION *x: Input vector. +* - vector_PRECISION *y: Output vector. 
* - config_PRECISION L: Cholesky factor ( lower triangular matrix ) *********************************************************************************/ - register int i, j; + register int id, i, j; int n; complex_PRECISION z[6]; + buffer_PRECISION x_pt = x->vector_buffer, y_pt = y->vector_buffer; + x_pt += start; y_pt += start; - for ( n=0; n<2; n++ ) { - // z = L^H x - for ( j=0; j<6; j++ ) { // columns - for ( i=0; ivector_buffer, y_pt = y->vector_buffer; + x_pt += start; y_pt += start; #ifdef HAVE_TM1p1 - if( g.n_flavours == 2) - for ( n=0; n<2; n++ ) { - for ( i=0; i<12; i++ ) { - y[i] = LU[i*(12+1)]*x[i]; - for ( j=i+1; j<12; j++ ) - y[i] += LU[i*12+j]*x[j]; + if( g.n_flavours == 2) { + LU += (start/24)*288; + for ( id=start; id0; i-- ) + for ( j=0; j0; i-- ) - for ( j=0; j0; i-- ) + for ( j=0; j0; i-- ) - for ( j=0; jvector_buffer += start; y->vector_buffer += start; PRECISION *sc_pt = op->clover_doublet_vectorized + (start/24)*288; - PRECISION *x_pt = (PRECISION*)x; - PRECISION *y_pt = (PRECISION*)y; + PRECISION *x_pt = (PRECISION*)x->vector_buffer; + PRECISION *y_pt = (PRECISION*)y->vector_buffer; for ( int i=start; iepsbar != 0 || op->epsbar_ig5_odd_shift != 0 || op->epsbar_ig5_odd_shift != 0 ) ) apply_doublet_coupling_PRECISION( x, y, epsbar_term, end-start ); #else - config_PRECISION sc = op->clover_doublet_oo_inv + (start/24)*288; - // diagonal blocks applied to the even sites - for ( int i=start; iclover_doublet_oo_inv, start, end); #endif } else { #endif - x += start; y += start; if ( g.csw ) { #ifdef OPTIMIZED_SELF_COUPLING_PRECISION + x->vector_buffer += start; y->vector_buffer += start; PRECISION *sc_pt = op->clover_vectorized + (start/12)*144; - PRECISION *x_pt = (PRECISION*)x; - PRECISION *y_pt = (PRECISION*)y; + PRECISION *x_pt = (PRECISION*)x->vector_buffer; + PRECISION *y_pt = (PRECISION*)y->vector_buffer; for ( int i=start; iclover + (start/12)*72; - // diagonal blocks applied to the even sites - for ( int i=start; iclover, start, end); #else - config_PRECISION sc = op->clover + (start/12)*42; - // diagonal blocks applied to the even sites - for ( int i=start; iclover, start, end ); #endif } else { config_PRECISION sc = op->clover + start; - for ( int i=start; ivector_buffer[i] = x->vector_buffer[i]*sc[i]; i++;) } } #ifdef HAVE_TM1p1 @@ -443,38 +470,29 @@ void diag_ee_PRECISION( vector_PRECISION y, vector_PRECISION x, operator_PRECISI } // for debugging only -void diag_ee_inv_PRECISION( vector_PRECISION y, vector_PRECISION x, operator_PRECISION_struct *op, +void diag_ee_inv_PRECISION( vector_PRECISION *y, vector_PRECISION *x, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { START_UNTHREADED_FUNCTION(threading) #ifdef HAVE_TM1p1 if( g.n_flavours == 2) { - int i, n1 = op->num_even_sites; - config_PRECISION sc = op->clover_doublet_oo_inv; - // diagonal blocks applied to the even sites - for ( i=0; inum_even_sites; + LU_perform_fwd_bwd_subs_PRECISION( y, x, op->clover_doublet_oo_inv, 0, n1*24); } else { #endif int i, n1 = op->num_even_sites; - config_PRECISION sc = op->clover; if ( g.csw ) { // diagonal blocks applied to the even sites - for ( i=0; iclover, 0, n1*12 ); #else - LU_perform_fwd_bwd_subs_PRECISION( y, x, sc ); - y+=12; x+=12; sc+=72; + LU_perform_fwd_bwd_subs_PRECISION( y, x, op->clover, 0, n1*12 ); #endif - } } else { - for ( i=0; iclover; + for ( i=0; ivector_buffer[i] = x->vector_buffer[i]/sc[i]; i++;) } } #ifdef HAVE_TM1p1 @@ -484,51 +502,35 @@ void diag_ee_inv_PRECISION( vector_PRECISION y, vector_PRECISION x, operator_PRE } 
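The hunks above replace the open-coded per-site clover loops with ranged helpers: LLH_perform_fwd_bwd_subs_PRECISION and LU_perform_fwd_bwd_subs_PRECISION now walk x->vector_buffer from start to end and advance the packed factor themselves. Per 6x6 block the work is one forward and one backward substitution against the Cholesky factor L, stored in packed lower-triangular form with element (r,c), r >= c, at index r*(r+1)/2 + c. A single-block sketch in double precision; site_cholesky_solve is a hypothetical name, and the library routines additionally cover the second spin block of each site and the HAVE_TM1p1 doublet layout:

    #include <complex.h>

    /* solve L * L^H * x = b for one 6x6 block; L holds 21 packed entries */
    static void site_cholesky_solve( double complex *x, const double complex *b,
                                     const double complex *L ) {
      /* forward substitution: L*y = b, with y overwriting x */
      for ( int i = 0; i < 6; i++ ) {
        x[i] = b[i];
        for ( int j = 0; j < i; j++ )
          x[i] -= L[i*(i+1)/2 + j] * x[j];
        x[i] /= L[i*(i+1)/2 + i];
      }
      /* backward substitution: L^H*x = y, using L^H(i,j) = conj(L(j,i)) */
      for ( int i = 5; i >= 0; i-- ) {
        for ( int j = i+1; j < 6; j++ )
          x[i] -= conj( L[j*(j+1)/2 + i] ) * x[j];
        x[i] /= conj( L[i*(i+1)/2 + i] );
      }
    }

Hoisting the site loop into the helper is what lets diag_ee_inv_PRECISION and its relatives shrink to a single ranged call instead of stepping the x, y and factor pointers at every call site.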
// for debugging only -void diag_oo_PRECISION( vector_PRECISION y, vector_PRECISION x, operator_PRECISION_struct *op, +void diag_oo_PRECISION( vector_PRECISION *y, vector_PRECISION *x, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { /********************************************************************************* * Applies the odd-odd block of the odd even decomposition to a vector. -* - vector_PRECISION x: Input vector. -* - vector_PRECISION y: Output vector. +* - vector_PRECISION *x: Input vector. +* - vector_PRECISION *y: Output vector. *********************************************************************************/ START_UNTHREADED_FUNCTION(threading) #ifdef HAVE_TM1p1 if( g.n_flavours == 2) { - int i, n1 = op->num_even_sites, n2 = op->num_odd_sites; - config_PRECISION sc = op->clover_doublet_oo_inv + n1*288; - x += n1*24; y += n1*24; - // diagonal blocks applied to the even sites - for ( i=0; inum_even_sites, n2 = op->num_odd_sites; + LU_multiply_PRECISION( y, x, op->clover_doublet_oo_inv, n1*24, (n1+n2)*24 ); } else { #endif int i, n1 = op->num_even_sites, n2 = op->num_odd_sites; - config_PRECISION sc = op->clover; - x += n1*12; y += n1*12; // diagonal blocks applied to the odd sites if ( g.csw ) { #ifndef HAVE_TM - sc += n1*42; - for ( i=0; iclover, n1*12, (n1+n2)*12 ); #else - sc += n1*72; - for ( i=0; iclover, n1*12, (n1+n2)*12 ); #endif } else { - sc += n1*12; - for ( i=0; iclover + n1*12; + for ( i=n1*12; i<(n1+n2)*12; ) { + FOR12( y->vector_buffer[i] = x->vector_buffer[i]*sc[i]; i++;) } } #ifdef HAVE_TM1p1 @@ -539,59 +541,46 @@ void diag_oo_PRECISION( vector_PRECISION y, vector_PRECISION x, operator_PRECISI } -void diag_oo_inv_PRECISION( vector_PRECISION y, vector_PRECISION x, operator_PRECISION_struct *op, +void diag_oo_inv_PRECISION( vector_PRECISION *y, vector_PRECISION *x, operator_PRECISION_struct *op, level_struct *l, int start, int end ) { #ifdef HAVE_TM1p1 if( g.n_flavours == 2) { - x += start; y += start; // inverse diagonal blocks applied to the odd sites #ifdef OPTIMIZED_SELF_COUPLING_PRECISION + x->vector_buffer += start; y->vector_buffer += start; PRECISION *sc_pt = op->clover_doublet_oo_inv_vectorized + (start/24)*2*288; - PRECISION *x_pt = (PRECISION*)x; - PRECISION *y_pt = (PRECISION*)y; + PRECISION *x_pt = (PRECISION*)x->vector_buffer; + PRECISION *y_pt = (PRECISION*)y->vector_buffer; for ( int i=start; iclover_doublet_oo_inv + (start/24)*288; - for ( int i=start; iclover_doublet_oo_inv, start, end ); #endif } else { #endif - config_PRECISION sc = op->clover; - x += start; y += start; // inverse diagonal blocks applied to the odd sites if ( g.csw ) { #ifdef OPTIMIZED_SELF_COUPLING_PRECISION + x->vector_buffer += start; y->vector_buffer += start; PRECISION *sc_pt = op->clover_vectorized + 2*2*(3*start); - PRECISION *x_pt = (PRECISION*)x; - PRECISION *y_pt = (PRECISION*)y; + PRECISION *x_pt = (PRECISION*)x->vector_buffer; + PRECISION *y_pt = (PRECISION*)y->vector_buffer; for ( int i=start; iclover, start, end ); #else - sc += (start/12)*42; - for ( int i=start; iclover, start, end ); #endif } else { - sc += start; - for ( int i=start; iclover + start; + for ( int i=start; ivector_buffer[i] = x->vector_buffer[i]/sc[i]; i++;) } } #ifdef HAVE_TM1p1 @@ -922,14 +911,14 @@ void oddeven_setup_PRECISION( operator_double_struct *in, level_struct *l ) { MALLOC( op->prnT, complex_PRECISION, j*8 ); op->prnZ = op->prnT + j; op->prnY = op->prnZ + j; op->prnX = op->prnY + j; op->prpT = op->prnX + j; op->prpZ = op->prpT + j; op->prpY = op->prpZ + 
j; op->prpX = op->prpY + j; - MALLOC( op->buffer, complex_PRECISION*, 2 ); - op->buffer[0] = NULL; + MALLOC( op->buffer, vector_PRECISION, 2 ); + vector_PRECISION_init(&(op->buffer[0])); #ifdef HAVE_TM1p1 - MALLOC( op->buffer[0], complex_PRECISION, 4*l->vector_size ); - op->buffer[1] = op->buffer[0] + 2*l->vector_size; + MALLOC( op->buffer[0].vector_buffer, complex_PRECISION, 4*l->vector_size ); + op->buffer[1].vector_buffer = op->buffer[0].vector_buffer + 2*l->vector_size; #else - MALLOC( op->buffer[0], complex_PRECISION, 2*l->vector_size ); - op->buffer[1] = op->buffer[0] + l->vector_size; + MALLOC( op->buffer[0].vector_buffer, complex_PRECISION, 2*l->vector_size ); + op->buffer[1].vector_buffer = op->buffer[0].vector_buffer + l->vector_size; #endif ghost_alloc_PRECISION( 0, &(op->c), l ); ghost_sendrecv_init_PRECISION( _COARSE_GLOBAL, &(op->c), l ) ; @@ -979,11 +968,11 @@ void oddeven_free_PRECISION( level_struct *l ) { } #ifdef HAVE_TM1p1 - FREE( l->oe_op_PRECISION.buffer[0], complex_PRECISION, 4*l->vector_size ); + FREE( l->oe_op_PRECISION.buffer[0].vector_buffer, complex_PRECISION, 4*l->vector_size ); #else - FREE( l->oe_op_PRECISION.buffer[0], complex_PRECISION, 2*l->vector_size ); + FREE( l->oe_op_PRECISION.buffer[0].vector_buffer, complex_PRECISION, 2*l->vector_size ); #endif - FREE( l->oe_op_PRECISION.buffer, complex_PRECISION*, 2 ); + FREE( l->oe_op_PRECISION.buffer, vector_PRECISION, 2 ); #ifdef HAVE_TM1p1 FREE( l->oe_op_PRECISION.prnT, complex_PRECISION, 2*(l->num_lattice_site_var/2)*l->num_lattice_sites*8 ); FREE( l->oe_op_PRECISION.clover_doublet_oo_inv, complex_PRECISION, 288*n ); @@ -993,7 +982,7 @@ void oddeven_free_PRECISION( level_struct *l ) { } -void oddeven_to_serial_PRECISION( vector_double out, vector_PRECISION in, level_struct *l, struct Thread *threading ) { +void oddeven_to_serial_PRECISION( vector_double *out, vector_PRECISION *in, level_struct *l, struct Thread *threading ) { /********************************************************************************* * Translates a vector from an odd even PRECISION precision layout to a serial @@ -1011,7 +1000,7 @@ void oddeven_to_serial_PRECISION( vector_double out, vector_PRECISION in, level_ for ( i=start; ivector_buffer[i*nsv+j] = (complex_double) in->vector_buffer[k*nsv+j]; } } END_NO_HYPERTHREADS(threading) @@ -1019,7 +1008,7 @@ void oddeven_to_serial_PRECISION( vector_double out, vector_PRECISION in, level_ } -void serial_to_oddeven_PRECISION( vector_PRECISION out, vector_double in, level_struct *l, struct Thread *threading ) { +void serial_to_oddeven_PRECISION( vector_PRECISION *out, vector_double *in, level_struct *l, struct Thread *threading ) { /********************************************************************************* * Translates a vector from a serial double precision layout to an odd even @@ -1037,7 +1026,7 @@ void serial_to_oddeven_PRECISION( vector_PRECISION out, vector_double in, level_ for ( i=start; ivector_buffer[k*nsv+j] = (complex_PRECISION) in->vector_buffer[i*nsv+j]; } } END_NO_HYPERTHREADS(threading) @@ -1045,7 +1034,7 @@ void serial_to_oddeven_PRECISION( vector_PRECISION out, vector_double in, level_ } -void oddeven_to_block_PRECISION( vector_PRECISION out, vector_PRECISION in, level_struct *l, struct Thread *threading ) { +void oddeven_to_block_PRECISION( vector_PRECISION *out, vector_PRECISION *in, level_struct *l, struct Thread *threading ) { int i, j, k, m, nsv = l->num_lattice_site_var, *tt_oe = l->oe_op_PRECISION.translation_table, @@ -1059,7 +1048,7 @@ void 
oddeven_to_block_PRECISION( vector_PRECISION out, vector_PRECISION in, leve for ( i=start; ivector_buffer[m*nsv+j] = in->vector_buffer[k*nsv+j]; } } END_NO_HYPERTHREADS(threading) @@ -1067,7 +1056,7 @@ void oddeven_to_block_PRECISION( vector_PRECISION out, vector_PRECISION in, leve } -void block_to_oddeven_PRECISION( vector_PRECISION out, vector_PRECISION in, level_struct *l, struct Thread *threading ) { +void block_to_oddeven_PRECISION( vector_PRECISION *out, vector_PRECISION *in, level_struct *l, struct Thread *threading ) { int i, j, k, m, nsv = l->num_lattice_site_var, *tt_oe = l->oe_op_PRECISION.translation_table, @@ -1081,14 +1070,14 @@ void block_to_oddeven_PRECISION( vector_PRECISION out, vector_PRECISION in, leve for ( i=start; ivector_buffer[k*nsv+j] = in->vector_buffer[m*nsv+j]; } } END_NO_HYPERTHREADS(threading) SYNC_CORES(threading) } -void hopping_term_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operator_PRECISION_struct *op, +void hopping_term_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, operator_PRECISION_struct *op, const int amount, level_struct *l, struct Thread *threading ) { int start_even, end_even, start_odd, end_odd, n = l->num_inner_lattice_sites, @@ -1120,7 +1109,7 @@ void hopping_term_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operato complex_PRECISION *prp[4] = { op->prpT, op->prpZ, op->prpY, op->prpX }; #else int i, *nb_pt; - vector_PRECISION phi_pt, eta_pt, end_pt; + buffer_PRECISION phi_pt, eta_pt, end_pt; config_PRECISION D_pt; #endif @@ -1128,10 +1117,10 @@ void hopping_term_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operato if( g.n_flavours == 2 ) { // project in negative directions #ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - dprp_PRECISION( prn, phi, 24*start, 24*n ); + dprp_PRECISION( prn, phi->vector_buffer, 24*start, 24*n ); #else complex_PRECISION pbuf[12]; - for ( i=12*start, phi_pt=phi+24*start; i<12*n; i+=12, phi_pt+=24 ) { + for ( i=12*start, phi_pt=phi->vector_buffer+24*start; i<12*n; i+=12, phi_pt+=24 ) { dprp_T_PRECISION( op->prnT+i, phi_pt ); dprp_Z_PRECISION( op->prnZ+i, phi_pt ); dprp_Y_PRECISION( op->prnY+i, phi_pt ); @@ -1147,9 +1136,9 @@ void hopping_term_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operato END_LOCKED_MASTER(threading) // project plus dir and multiply with U dagger #ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - dprn_su3_PRECISION( prp, phi, op, neighbor, 24*start, 24*n ); + dprn_su3_PRECISION( prp, phi->vector_buffer, op, neighbor, 24*start, 24*n ); #else - for ( phi_pt=phi+24*start, end_pt=phi+24*n, D_pt = op->D+36*start, nb_pt=neighbor+4*start; phi_ptvector_buffer+24*start, end_pt=phi->vector_buffer+24*n, D_pt = op->D+36*start, nb_pt=neighbor+4*start; phi_ptvector_buffer, prn, op, neighbor, 24*start, 24*n ); #else - for ( eta_pt=eta+24*start, end_pt=eta+24*n, D_pt = op->D+36*start, nb_pt=neighbor+4*start; eta_ptvector_buffer+24*start, end_pt=eta->vector_buffer+24*n, D_pt = op->D+36*start, nb_pt=neighbor+4*start; eta_ptprnT+i ); @@ -1241,9 +1230,9 @@ void hopping_term_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operato END_LOCKED_MASTER(threading) // lift up plus dir #ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - dpbn_PRECISION( eta, prp, 24*start, 24*n ); + dpbn_PRECISION( eta->vector_buffer, prp, 24*start, 24*n ); #else - for ( i=12*start, eta_pt=eta+24*start; i<12*n; i+=12, eta_pt+=24 ) { + for ( i=12*start, eta_pt=eta->vector_buffer+24*start; i<12*n; i+=12, eta_pt+=24 ) { dpbn_su3_T_PRECISION( op->prpT+i, eta_pt ); dpbn_su3_Z_PRECISION( 
op->prpZ+i, eta_pt ); dpbn_su3_Y_PRECISION( op->prpY+i, eta_pt ); @@ -1254,10 +1243,10 @@ void hopping_term_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operato #endif // project in negative directions #ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - prp_PRECISION( prn, phi, 12*start, 12*n ); + prp_PRECISION( prn, phi->vector_buffer, 12*start, 12*n ); #else complex_PRECISION pbuf[6]; - for ( i=6*start, phi_pt=phi+12*start; i<6*n; i+=6, phi_pt+=12 ) { + for ( i=6*start, phi_pt=phi->vector_buffer+12*start; i<6*n; i+=6, phi_pt+=12 ) { prp_T_PRECISION( op->prnT+i, phi_pt ); prp_Z_PRECISION( op->prnZ+i, phi_pt ); prp_Y_PRECISION( op->prnY+i, phi_pt ); @@ -1273,9 +1262,9 @@ void hopping_term_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operato END_LOCKED_MASTER(threading) // project plus dir and multiply with U dagger #ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - prn_su3_PRECISION( prp, phi, op, neighbor, 12*start, 12*n ); + prn_su3_PRECISION( prp, phi->vector_buffer, op, neighbor, 12*start, 12*n ); #else - for ( phi_pt=phi+12*start, end_pt=phi+12*n, D_pt = op->D+36*start, nb_pt=neighbor+4*start; phi_ptvector_buffer+12*start, end_pt=phi->vector_buffer+12*n, D_pt = op->D+36*start, nb_pt=neighbor+4*start; phi_ptvector_buffer, prn, op, neighbor, 12*start, 12*n ); #else - for ( eta_pt=eta+12*start, end_pt=eta+12*n, D_pt = op->D+36*start, nb_pt=neighbor+4*start; eta_ptvector_buffer+12*start, end_pt=eta->vector_buffer+12*n, D_pt = op->D+36*start, nb_pt=neighbor+4*start; eta_ptprnT+i ); @@ -1351,9 +1340,9 @@ void hopping_term_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operato END_LOCKED_MASTER(threading) // lift up plus dir #ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - pbn_PRECISION( eta, prp, 12*start, 12*n ); + pbn_PRECISION( eta->vector_buffer, prp, 12*start, 12*n ); #else - for ( i=6*start, eta_pt=eta+12*start; i<6*n; i+=6, eta_pt+=12 ) { + for ( i=6*start, eta_pt=eta->vector_buffer+12*start; i<6*n; i+=6, eta_pt+=12 ) { pbn_su3_T_PRECISION( op->prpT+i, eta_pt ); pbn_su3_Z_PRECISION( op->prpZ+i, eta_pt ); pbn_su3_Y_PRECISION( op->prpY+i, eta_pt ); @@ -1367,7 +1356,7 @@ void hopping_term_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operato SYNC_CORES(threading) } -void apply_schur_complement_PRECISION( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, +void apply_schur_complement_PRECISION( vector_PRECISION *out, vector_PRECISION *in, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { /********************************************************************************* @@ -1383,8 +1372,8 @@ void apply_schur_complement_PRECISION( vector_PRECISION out, vector_PRECISION in vector_PRECISION *tmp = op->buffer; SYNC_CORES(threading) - vector_PRECISION_define( tmp[0], 0, start_odd, end_odd, l ); - vector_PRECISION_define( tmp[0], 0, start_even, end_even, l ); + vector_PRECISION_define( &tmp[0], 0, start_odd, end_odd, l ); + vector_PRECISION_define( &tmp[0], 0, start_even, end_even, l ); SYNC_CORES(threading) PROF_PRECISION_START( _NC, threading ); @@ -1392,17 +1381,17 @@ void apply_schur_complement_PRECISION( vector_PRECISION out, vector_PRECISION in diag_ee_PRECISION( out, in, op, l, start_even, end_even ); SYNC_CORES(threading) PROF_PRECISION_STOP( _SC, 1, threading ); - hopping_term_PRECISION( tmp[0], in, op, _ODD_SITES, l, threading ); + hopping_term_PRECISION( &tmp[0], in, op, _ODD_SITES, l, threading ); PROF_PRECISION_STOP( _NC, 0, threading ); PROF_PRECISION_START( _SC, threading ); - diag_oo_inv_PRECISION( 
tmp[1], tmp[0], op, l, start_odd, end_odd );
+  diag_oo_inv_PRECISION( &tmp[1], &tmp[0], op, l, start_odd, end_odd );
   SYNC_CORES(threading)
   PROF_PRECISION_STOP( _SC, 0, threading );
   PROF_PRECISION_START( _NC, threading );
-  hopping_term_PRECISION( tmp[0], tmp[1], op, _EVEN_SITES, l, threading );
+  hopping_term_PRECISION( &tmp[0], &tmp[1], op, _EVEN_SITES, l, threading );
   PROF_PRECISION_STOP( _NC, 1, threading );
-  vector_PRECISION_minus( out, out, tmp[0], start_even, end_even, l );
+  vector_PRECISION_minus( out, out, &tmp[0], start_even, end_even, l );
 }
 
@@ -1417,80 +1406,68 @@ void solve_oddeven_PRECISION( gmres_PRECISION_struct *p, operator_PRECISION_stru
 
   // odd to even
   PROF_PRECISION_START( _SC, threading );
-  diag_oo_inv_PRECISION( tmp, p->b, op, l, start, end );
+  diag_oo_inv_PRECISION( &tmp, &(p->b), op, l, start, end );
   PROF_PRECISION_STOP( _SC, 0, threading );
   SYNC_CORES(threading)
-  vector_PRECISION_scale( tmp, tmp, -1, start, end, l );
+  vector_PRECISION_scale( &tmp, &tmp, -1, start, end, l );
   SYNC_CORES(threading)
   PROF_PRECISION_START( _NC, threading );
-  hopping_term_PRECISION( p->b, tmp, op, _EVEN_SITES, l, threading );
+  hopping_term_PRECISION( &(p->b), &tmp, op, _EVEN_SITES, l, threading );
   PROF_PRECISION_STOP( _NC, 0, threading );
 
   if ( g.method == 4 )
     fgmres_PRECISION( p, l, threading );
   else if ( g.method == 5 )
     bicgstab_PRECISION( p, l, threading );
-  diag_oo_inv_PRECISION( p->x, p->b, op, l, start, end );
+  diag_oo_inv_PRECISION( &(p->x), &(p->b), op, l, start, end );
 
   // even to odd
   SYNC_CORES(threading)
-  vector_PRECISION_define( tmp, 0, start, end, l );
+  vector_PRECISION_define( &tmp, 0, start, end, l );
   SYNC_CORES(threading)
   PROF_PRECISION_START( _NC, threading );
-  hopping_term_PRECISION( tmp, p->x, op, _ODD_SITES, l, threading );
+  hopping_term_PRECISION( &tmp, &(p->x), op, _ODD_SITES, l, threading );
   PROF_PRECISION_STOP( _NC, 1, threading );
   PROF_PRECISION_START( _SC, threading );
-  diag_oo_inv_PRECISION( p->b, tmp, op, l, start, end );
+  diag_oo_inv_PRECISION( &(p->b), &tmp, op, l, start, end );
   PROF_PRECISION_STOP( _SC, 1, threading );
   SYNC_CORES(threading)
-  vector_PRECISION_minus( p->x, p->x, p->b, start, end, l );
+  vector_PRECISION_minus( &(p->x), &(p->x), &(p->b), start, end, l );
   SYNC_CORES(threading)
 }
 
-void g5_PRECISION( vector_PRECISION eta, vector_PRECISION phi, int start, int end, level_struct *l ) {
-  if ( eta != phi ) {
-    vector_PRECISION eta_end = eta + end;
-    eta += start;
-    phi += start;
-    while ( eta < eta_end ) {
-      FOR6( *eta = -(*phi); phi++; eta++; )
-      FOR6( *eta = (*phi); phi++; eta++; )
+void g5_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, int start, int end, level_struct *l ) {
+  if ( eta->vector_buffer != phi->vector_buffer ) {
+    for( int i = start; i < end; ) {
+      FOR6( eta->vector_buffer[i] = -phi->vector_buffer[i]; i++; )
+      FOR6( eta->vector_buffer[i] = phi->vector_buffer[i]; i++; )
     }
   } else {
-    vector_PRECISION eta_end = eta + end;
-    eta += start;
-    phi += start;
-    while ( eta < eta_end ) {
-      FOR6( *eta = -(*phi); phi++; eta++; )
-      eta+=6; phi+=6;
+    for ( int i = start; i < end; ) {
+      FOR6( eta->vector_buffer[i] = -phi->vector_buffer[i]; i++; )
+      i+=6;
     }
   }
 }
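g5_PRECISION and minus_g5_PRECISION apply gamma_5 and -gamma_5, which in this basis act as diag(-1,...,-1,+1,...,+1) on the 12 spin-color components of each site: the first six entries flip sign, the last six are untouched. In the aliased case, where eta and phi share a buffer, the skip over the untouched entries has to advance the loop index only; bumping eta->vector_buffer itself would permanently shift the caller's vector. A compact double-precision sketch of the same operation, using the hypothetical name g5_sketch and vector_double as defined earlier, assuming start and end are multiples of the site size 12:

    /* eta = gamma_5 * phi on [start,end); safe when eta aliases phi */
    static void g5_sketch( vector_double *eta, vector_double *phi, int start, int end ) {
      for ( int i = start; i < end; i += 12 ) {
        for ( int k = 0; k < 6; k++ )      /* upper spin components: factor -1 */
          eta->vector_buffer[i+k] = -phi->vector_buffer[i+k];
        if ( eta->vector_buffer != phi->vector_buffer )
          for ( int k = 6; k < 12; k++ )   /* lower components: copy only when not aliased */
            eta->vector_buffer[i+k] = phi->vector_buffer[i+k];
      }
    }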
-void minus_g5_PRECISION( vector_PRECISION eta, vector_PRECISION phi, int start, int end, level_struct *l ) {
-  if ( eta != phi ) {
-    vector_PRECISION eta_end = eta + end;
-    eta += start;
-    phi += start;
-    while ( eta < eta_end ) {
-      FOR6( *eta = (*phi); phi++; eta++; )
-      FOR6( *eta = -(*phi); phi++; eta++; )
+void minus_g5_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, int start, int end, level_struct *l ) {
+  if ( eta->vector_buffer != phi->vector_buffer ) {
+    for ( int i = start; i < end; ) {
+      FOR6( eta->vector_buffer[i] = phi->vector_buffer[i]; i++; )
+      FOR6( eta->vector_buffer[i] = -phi->vector_buffer[i]; i++; )
    }
  } else {
-    vector_PRECISION eta_end = eta + end;
-    eta += start;
-    phi += start;
-    while ( eta < eta_end ) {
-      eta+=6; phi+=6;
-      FOR6( *eta = -(*phi); phi++; eta++; )
+    for ( int i = start; i < end; ) {
+      i+=6;
+      FOR6( eta->vector_buffer[i] = -phi->vector_buffer[i]; i++; )
    }
  }
}
 
-void g5D_apply_schur_complement_PRECISION( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) {
+void g5D_apply_schur_complement_PRECISION( vector_PRECISION *out, vector_PRECISION *in, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) {
 
 /*********************************************************************************
 * Applies the Schur complement to a vector.
@@ -1502,10 +1479,14 @@ void g5D_apply_schur_complement_PRECISIO
   compute_core_start_end_custom(op->num_even_sites*l->num_lattice_site_var, l->inner_vector_size, &start_odd, &end_odd, l, threading, l->num_lattice_site_var );
 
   vector_PRECISION *tmp = op->buffer;
+
   SYNC_CORES(threading)
-  vector_PRECISION_define( tmp[0], 0, start_odd, end_odd, l );
-  vector_PRECISION_define( tmp[0], 0, start_even, end_even, l );
+  vector_PRECISION_define( &tmp[0], 0, start_odd, end_odd, l );
+  vector_PRECISION_define( &tmp[0], 0, start_even, end_even, l );
   SYNC_CORES(threading)
 
   PROF_PRECISION_START( _NC, threading );
@@ -1513,17 +1494,17 @@ void g5D_apply_schur_complement_PRECISIO
   diag_ee_PRECISION( out, in, op, l, start_even, end_even );
   SYNC_CORES(threading)
   PROF_PRECISION_STOP( _SC, 1, threading );
-  hopping_term_PRECISION( tmp[0], in, op, _ODD_SITES, l, threading );
+  hopping_term_PRECISION( &tmp[0], in, op, _ODD_SITES, l, threading );
   PROF_PRECISION_STOP( _NC, 0, threading );
   PROF_PRECISION_START( _SC, threading );
-  diag_oo_inv_PRECISION( tmp[1], tmp[0], op, l, start_odd, end_odd );
+  diag_oo_inv_PRECISION( &tmp[1], &tmp[0], op, l, start_odd, end_odd );
   SYNC_CORES(threading)
   PROF_PRECISION_STOP( _SC, 0, threading );
   PROF_PRECISION_START( _NC, threading );
-  hopping_term_PRECISION( tmp[0], tmp[1], op, _EVEN_SITES, l, threading );
+  hopping_term_PRECISION( &tmp[0], &tmp[1], op, _EVEN_SITES, l, threading );
   PROF_PRECISION_STOP( _NC, 1, threading );
-  vector_PRECISION_minus( out, out, tmp[0], start_even, end_even, l );
+  vector_PRECISION_minus( out, out, &tmp[0], start_even, end_even, l );
   SYNC_CORES(threading)
   g5_PRECISION( out, out, start_even, end_even, l );
   // g5_PRECISION( out, out, start_odd, end_odd, l );
@@ -1541,40 +1522,40 @@ void g5D_solve_oddeven_PRECISION( gmres_PRECISION_struct *p, operator_PRECISION_
 
   // odd to even
   PROF_PRECISION_START( _SC, threading );
-  diag_oo_inv_PRECISION( tmp, p->b, op, l, start_odd, end_odd );
+  diag_oo_inv_PRECISION( &tmp, &(p->b), op, l, start_odd, end_odd );
   PROF_PRECISION_STOP( _SC, 0, threading );
   SYNC_CORES(threading)
   // g5_PRECISION( tmp, tmp, start_odd, end_odd, l );
   // vector_PRECISION_scale( tmp, tmp, -1, start_odd, end_odd, l );
-  minus_g5_PRECISION( tmp, tmp, start_odd, end_odd, l );
+  minus_g5_PRECISION( &tmp, &tmp,
start_odd, end_odd, l ); SYNC_CORES(threading) PROF_PRECISION_START( _NC, threading ); - vector_PRECISION_define( p->x, 0, start_even, end_even, l ); - hopping_term_PRECISION( p->x, tmp, op, _EVEN_SITES, l, threading ); + vector_PRECISION_define( &(p->x), 0, start_even, end_even, l ); + hopping_term_PRECISION( &(p->x), &tmp, op, _EVEN_SITES, l, threading ); PROF_PRECISION_STOP( _NC, 0, threading ); SYNC_CORES(threading) - g5_PRECISION( p->x, p->x, start_even, end_even, l ); - vector_PRECISION_plus( p->b, p->b, p->x, start_even, end_even, l ); + g5_PRECISION( &(p->x), &(p->x), start_even, end_even, l ); + vector_PRECISION_plus( &(p->b), &(p->b), &(p->x), start_even, end_even, l ); SYNC_CORES(threading) ASSERT( g.method == 6 ); fgmres_PRECISION( p, l, threading ); - diag_oo_inv_PRECISION( p->x, p->b, op, l, start_odd, end_odd ); - g5_PRECISION( p->x, p->x, start_odd, end_odd, l ); + diag_oo_inv_PRECISION( &(p->x), &(p->b), op, l, start_odd, end_odd ); + g5_PRECISION( &(p->x), &(p->x), start_odd, end_odd, l ); // even to odd SYNC_CORES(threading) - vector_PRECISION_define( tmp, 0, start_odd, end_odd, l ); + vector_PRECISION_define( &tmp, 0, start_odd, end_odd, l ); SYNC_CORES(threading) PROF_PRECISION_START( _NC, threading ); - hopping_term_PRECISION( tmp, p->x, op, _ODD_SITES, l, threading ); + hopping_term_PRECISION( &tmp, &(p->x), op, _ODD_SITES, l, threading ); PROF_PRECISION_STOP( _NC, 1, threading ); SYNC_CORES(threading) PROF_PRECISION_START( _SC, threading ); - diag_oo_inv_PRECISION( p->b, tmp, op, l, start_odd, end_odd ); + diag_oo_inv_PRECISION( &(p->b), &tmp, op, l, start_odd, end_odd ); PROF_PRECISION_STOP( _SC, 1, threading ); SYNC_CORES(threading) - vector_PRECISION_minus( p->x, p->x, p->b, start_odd, end_odd, l ); + vector_PRECISION_minus( &(p->x), &(p->x), &(p->b), start_odd, end_odd, l ); SYNC_CORES(threading) } @@ -1756,7 +1737,7 @@ void schwarz_PRECISION_oddeven_setup( schwarz_PRECISION_struct *s, level_struct #endif } -void block_diag_ee_PRECISION( vector_PRECISION eta, vector_PRECISION phi, +void block_diag_ee_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, int start, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ) { START_UNTHREADED_FUNCTION(threading) @@ -1767,7 +1748,7 @@ void block_diag_ee_PRECISION( vector_PRECISION eta, vector_PRECISION phi, } // diagonal blocks applied to the odd sites of a block -void block_diag_oo_PRECISION( vector_PRECISION eta, vector_PRECISION phi, +void block_diag_oo_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, int start, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ) { START_UNTHREADED_FUNCTION(threading) @@ -1785,39 +1766,31 @@ void block_diag_oo_PRECISION( vector_PRECISION eta, vector_PRECISION phi, if ( g.n_flavours == 2 ) { int block_num = start/24/(n1+n2); // config_PRECISION clover = s->op.clover_doublet_oo_inv+n1*288+(start/24)*288; - config_PRECISION clover = s->op.clover_doublet_oo_inv+(start/24-block_num*n1)*288; - vector_PRECISION lphi = phi+n1*24+start, leta = eta+n1*24+start; - for ( i=0; iop.clover_doublet_oo_inv-(block_num+1)*n1*288; + LU_multiply_PRECISION( eta, phi, clover, n1*24+start, (n1+n2)*24+start ); } else { #endif - vector_PRECISION lphi = phi+n1*12+start, leta = eta+n1*12+start; if ( g.csw ) { int block_num = start/12/(n1+n2); #ifndef HAVE_TM - config_PRECISION clover = s->op.clover_oo_inv+(start/12-block_num*n1)*42; - for ( i=0; iop.clover_oo_inv-(block_num+1)*n1*42; + LLH_multiply_PRECISION( eta, phi, clover, n1*12+start, (n1+n2)*12+start ); 
#else - config_PRECISION clover = s->op.clover_oo_inv+(start/12-block_num*n1)*72; - for ( i=0; iop.clover_oo_inv-(block_num+1)*n1*72; + LU_multiply_PRECISION( eta, phi, clover, n1*12+start, (n1+n2)*12+start ); #endif } else { + vector_PRECISION lphi, leta; + lphi.vector_buffer = phi->vector_buffer+n1*12+start; + leta.vector_buffer = eta->vector_buffer+n1*12+start; config_PRECISION clover = s->op.clover+n1*12+start; #ifndef HAVE_TM for ( i=0; i<12*n2; i++ ) - leta[i] = lphi[i]*(clover[i]); + leta.vector_buffer[i] = lphi.vector_buffer[i]*(clover[i]); #else config_PRECISION tm_term = s->op.tm_term+n1*12+start; for ( i=0; i<12*n2; i++ ) - leta[i] = lphi[i]*(clover[i]+tm_term[i]); + leta.vector_buffer[i] = lphi.vector_buffer[i]*(clover[i]+tm_term[i]); #endif } #ifdef HAVE_TM1p1 @@ -1829,7 +1802,7 @@ void block_diag_oo_PRECISION( vector_PRECISION eta, vector_PRECISION phi, } // inverted diagonal blocks applied to the odd sites of a block -void block_diag_oo_inv_PRECISION( vector_PRECISION eta, vector_PRECISION phi, int start, schwarz_PRECISION_struct *s, +void block_diag_oo_inv_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, int start, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ) { START_UNTHREADED_FUNCTION(threading) @@ -1839,58 +1812,56 @@ void block_diag_oo_inv_PRECISION( vector_PRECISION eta, vector_PRECISION phi, in #ifdef HAVE_TM1p1 if ( g.n_flavours == 2 ) { - vector_PRECISION lphi = phi+n1*24+start, leta = eta+n1*24+start; int block_num = start/24/(n1+n2); #ifndef OPTIMIZED_SELF_COUPLING_PRECISION - config_PRECISION clover = s->op.clover_doublet_oo_inv + (start/24-block_num*n1)*288; - for ( i=0; iop.clover_doublet_oo_inv-(block_num+1)*n1*288; + LU_perform_fwd_bwd_subs_PRECISION( eta, phi, clover, n1*24+start, (n1+n2)*24+start ); #else PRECISION *clover_vectorized = s->op.clover_doublet_oo_inv_vectorized + (start/24-block_num*n1)*2*288; + vector_PRECISION lphi, leta; + lphi.vector_buffer = phi->vector_buffer+n1*24+start; + leta.vector_buffer = eta->vector_buffer+n1*24+start; for ( i=0; iop.clover_oo_inv+(start/12-block_num*n1)*42; - for ( i=0; iop.clover_oo_inv-(block_num+1)*n1*42; + LLH_perform_fwd_bwd_subs_PRECISION( eta, phi, clover, n1*12+start, (n1+n2)*12+start ); #else - config_PRECISION clover = s->op.clover_oo_inv+(start/12-block_num*n1)*72; - for ( i=0; iop.clover_oo_inv-(block_num+1)*n1*72; + LU_perform_fwd_bwd_subs_PRECISION( eta, phi, clover, n1*12+start, (n1+n2)*12+start ); #endif #else PRECISION *clover_vectorized = s->op.clover_oo_inv_vectorized + (start/12-block_num*n1)*144; + vector_PRECISION lphi, leta; + lphi.vector_buffer = phi->vector_buffer+n1*12+start; + leta.vector_buffer = eta->vector_buffer+n1*12+start; for ( i=0; iop.clover+n1*12+start; + vector_PRECISION lphi, leta; + lphi.vector_buffer = phi->vector_buffer+n1*12+start; + leta.vector_buffer = eta->vector_buffer+n1*12+start; #ifndef HAVE_TM for ( i=0; i<12*n2; i++ ) - leta[i] = lphi[i]/(clover[i]); + leta.vector_buffer[i] = lphi.vector_buffer[i]/(clover[i]); #else config_PRECISION tm_term = s->op.tm_term+n1*12+start; for ( i=0; i<12*n2; i++ ) - leta[i] = lphi[i]/(clover[i]+tm_term[i]); + leta.vector_buffer[i] = lphi.vector_buffer[i]/(clover[i]+tm_term[i]); #endif } #ifdef HAVE_TM1p1 @@ -1901,7 +1872,7 @@ void block_diag_oo_inv_PRECISION( vector_PRECISION eta, vector_PRECISION phi, in } -void block_hopping_term_PRECISION( vector_PRECISION eta, vector_PRECISION phi, +void block_hopping_term_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, int start, int amount, 
schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ) { START_UNTHREADED_FUNCTION(threading) @@ -1924,15 +1895,15 @@ void block_hopping_term_PRECISION( vector_PRECISION eta, vector_PRECISION phi, a1 = 0; n1 = length_even[mu]+length_odd[mu]; a2 = 0; n2 = n1; } - block_oddeven_plus_coupling_PRECISION( (PRECISION*)(eta+start), Dplus, (PRECISION*)(phi+start), mu, a1, n1, index[mu], neighbor ); - block_oddeven_minus_coupling_PRECISION( (PRECISION*)(eta+start), Dminus, (PRECISION*)(phi+start), mu, a2, n2, index[mu], neighbor ); + block_oddeven_plus_coupling_PRECISION( (PRECISION*)(eta->vector_buffer+start), Dplus, (PRECISION*)(phi->vector_buffer+start), mu, a1, n1, index[mu], neighbor ); + block_oddeven_minus_coupling_PRECISION( (PRECISION*)(eta->vector_buffer+start), Dminus, (PRECISION*)(phi->vector_buffer+start), mu, a2, n2, index[mu], neighbor ); } #else config_PRECISION D = s->op.D + (start/nv)*36; int i, j, k, *ind; config_PRECISION D_pt; - vector_PRECISION lphi = phi+start, leta = eta+start; + buffer_PRECISION lphi = phi->vector_buffer+start, leta = eta->vector_buffer+start; #ifdef HAVE_TM1p1 if ( g.n_flavours == 2 ) { @@ -2194,7 +2165,7 @@ void block_hopping_term_PRECISION( vector_PRECISION eta, vector_PRECISION phi, } -void block_n_hopping_term_PRECISION( vector_PRECISION eta, vector_PRECISION phi, +void block_n_hopping_term_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, int start, int amount, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ) { START_UNTHREADED_FUNCTION(threading) @@ -2217,13 +2188,13 @@ void block_n_hopping_term_PRECISION( vector_PRECISION eta, vector_PRECISION phi, a1 = 0; n1 = length_even[mu]+length_odd[mu]; a2 = 0; n2 = n1; } - block_oddeven_nplus_coupling_PRECISION( (PRECISION*)(eta+start), Dplus, (PRECISION*)(phi+start), mu, a1, n1, index[mu], neighbor ); - block_oddeven_nminus_coupling_PRECISION( (PRECISION*)(eta+start), Dminus, (PRECISION*)(phi+start), mu, a2, n2, index[mu], neighbor ); + block_oddeven_nplus_coupling_PRECISION( (PRECISION*)(eta->vector_buffer+start), Dplus, (PRECISION*)(phi->vector_buffer+start), mu, a1, n1, index[mu], neighbor ); + block_oddeven_nminus_coupling_PRECISION( (PRECISION*)(eta->vector_buffer+start), Dminus, (PRECISION*)(phi->vector_buffer+start), mu, a2, n2, index[mu], neighbor ); } #else int i, j, k, *ind; - vector_PRECISION lphi = phi+start, leta = eta+start; + buffer_PRECISION lphi = phi->vector_buffer+start, leta = eta->vector_buffer+start; config_PRECISION D_pt, D = s->op.D + (start/nv)*36; #ifdef HAVE_TM1p1 @@ -2487,22 +2458,22 @@ void block_n_hopping_term_PRECISION( vector_PRECISION eta, vector_PRECISION phi, } -void apply_block_schur_complement_PRECISION( vector_PRECISION out, vector_PRECISION in, int start, +void apply_block_schur_complement_PRECISION( vector_PRECISION *out, vector_PRECISION *in, int start, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ) { vector_PRECISION *tmp = s->oe_buf; block_diag_ee_PRECISION( out, in, start, s, l, threading ); START_LOCKED_MASTER(threading) - vector_PRECISION_define( tmp[0], 0, start + l->num_lattice_site_var*s->num_block_even_sites, start + s->block_vector_size, l ); + vector_PRECISION_define( &tmp[0], 0, start + l->num_lattice_site_var*s->num_block_even_sites, start + s->block_vector_size, l ); END_LOCKED_MASTER(threading) - block_hopping_term_PRECISION( tmp[0], in, start, _ODD_SITES, s, l, threading ); - block_diag_oo_inv_PRECISION( tmp[1], tmp[0], start, s, l, threading ); - block_n_hopping_term_PRECISION( 
out, tmp[1], start, _EVEN_SITES, s, l, threading ); + block_hopping_term_PRECISION( &tmp[0], in, start, _ODD_SITES, s, l, threading ); + block_diag_oo_inv_PRECISION( &tmp[1], &tmp[0], start, s, l, threading ); + block_n_hopping_term_PRECISION( out, &tmp[1], start, _EVEN_SITES, s, l, threading ); } -void block_solve_oddeven_PRECISION( vector_PRECISION phi, vector_PRECISION r, vector_PRECISION latest_iter, +void block_solve_oddeven_PRECISION( vector_PRECISION *phi, vector_PRECISION *r, vector_PRECISION *latest_iter, int start, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ) { START_UNTHREADED_FUNCTION(threading) @@ -2511,21 +2482,19 @@ void block_solve_oddeven_PRECISION( vector_PRECISION phi, vector_PRECISION r, ve int end = start+s->block_vector_size; // odd to even - vector_PRECISION_copy( tmp[3], r, start, end, l ); - block_diag_oo_inv_PRECISION( tmp[2], tmp[3], start, s, l, no_threading ); - block_n_hopping_term_PRECISION( tmp[3], tmp[2], start, _EVEN_SITES, s, l, no_threading ); - - local_minres_PRECISION( NULL, tmp[3], tmp[2], start, s, l, no_threading ); - + vector_PRECISION_copy( &tmp[3], r, start, end, l ); + block_diag_oo_inv_PRECISION( &tmp[2], &tmp[3], start, s, l, no_threading ); + block_n_hopping_term_PRECISION( &tmp[3], &tmp[2], start, _EVEN_SITES, s, l, no_threading ); + local_minres_PRECISION( NULL, &tmp[3], &tmp[2], start, s, l, no_threading ); // even to odd - block_n_hopping_term_PRECISION( tmp[3], tmp[2], start, _ODD_SITES, s, l, no_threading ); - block_diag_oo_inv_PRECISION( tmp[2], tmp[3], start, s, l, no_threading ); + block_n_hopping_term_PRECISION( &tmp[3], &tmp[2], start, _ODD_SITES, s, l, no_threading ); + block_diag_oo_inv_PRECISION( &tmp[2], &tmp[3], start, s, l, no_threading ); // update phi, latest_iter - vector_PRECISION_copy( latest_iter, tmp[2], start, end, l ); - vector_PRECISION_plus( phi, phi, tmp[2], start, end, l ); + vector_PRECISION_copy( latest_iter, &tmp[2], start, end, l ); + vector_PRECISION_plus( phi, phi, &tmp[2], start, end, l ); // update r - vector_PRECISION_copy( r, tmp[3], start, start+l->num_lattice_site_var*s->num_block_even_sites, l ); + vector_PRECISION_copy( r, &tmp[3], start, start+l->num_lattice_site_var*s->num_block_even_sites, l ); vector_PRECISION_define( r, 0, start+l->num_lattice_site_var*s->num_block_even_sites, end, l ); END_UNTHREADED_FUNCTION(threading) @@ -2537,55 +2506,61 @@ void block_oddeven_PRECISION_test( level_struct *l, struct Thread *threading ) { schwarz_PRECISION_struct *s = &(l->s_PRECISION); - vector_PRECISION b1 = NULL, b2 = NULL, b3 = NULL, b4 = NULL, b5 = NULL; + vector_PRECISION b1, b2, b3, b4, b5; PRECISION diff; + + vector_PRECISION_init(&b1); + vector_PRECISION_init(&b2); + vector_PRECISION_init(&b3); + vector_PRECISION_init(&b4); + vector_PRECISION_init(&b5); int vs = s->block_vector_size * s->num_blocks; - MALLOC( b1, complex_PRECISION, vs ); - MALLOC( b2, complex_PRECISION, vs ); - MALLOC( b3, complex_PRECISION, vs ); - MALLOC( b4, complex_PRECISION, vs ); - MALLOC( b5, complex_PRECISION, vs ); + MALLOC( b1.vector_buffer, complex_PRECISION, vs ); + MALLOC( b2.vector_buffer, complex_PRECISION, vs ); + MALLOC( b3.vector_buffer, complex_PRECISION, vs ); + MALLOC( b4.vector_buffer, complex_PRECISION, vs ); + MALLOC( b5.vector_buffer, complex_PRECISION, vs ); - vector_PRECISION_define_random( b1, 0, vs, l ); + vector_PRECISION_define_random( &b1, 0, vs, l ); for (int i = 0; i< s->num_blocks; i++ ) { - block_diag_ee_PRECISION( b2, b1, s->block[i].start*l->num_lattice_site_var, s, l, 
no_threading ); - block_diag_oo_PRECISION( b2, b1, s->block[i].start*l->num_lattice_site_var, s, l, no_threading ); - block_hopping_term_PRECISION( b2, b1, s->block[i].start*l->num_lattice_site_var, _FULL_SYSTEM, s, l, no_threading ); + block_diag_ee_PRECISION( &b2, &b1, s->block[i].start*l->num_lattice_site_var, s, l, no_threading ); + block_diag_oo_PRECISION( &b2, &b1, s->block[i].start*l->num_lattice_site_var, s, l, no_threading ); + block_hopping_term_PRECISION( &b2, &b1, s->block[i].start*l->num_lattice_site_var, _FULL_SYSTEM, s, l, no_threading ); - block_d_plus_clover_PRECISION( b3, b1, s->block[i].start*l->num_lattice_site_var, s, l, no_threading ); + block_d_plus_clover_PRECISION( &b3, &b1, s->block[i].start*l->num_lattice_site_var, s, l, no_threading ); } - vector_PRECISION_minus( b3, b3, b2, 0, vs, l ); - diff = global_norm_PRECISION( b3, 0, vs, l, no_threading ) / global_norm_PRECISION( b2, 0, vs, l, no_threading ); + vector_PRECISION_minus( &b3, &b3, &b2, 0, vs, l ); + diff = global_norm_PRECISION( &b3, 0, vs, l, no_threading ) / global_norm_PRECISION( &b2, 0, vs, l, no_threading ); test0_PRECISION("depth: %d, correctness of block odd even layout: %le\n", l->depth, diff ); - vector_PRECISION_copy( b4, b1, 0, s->block_vector_size, l ); - vector_PRECISION_define( b3, 0, l->num_lattice_site_var*s->num_block_even_sites, s->block_vector_size, l ); + vector_PRECISION_copy( &b4, &b1, 0, s->block_vector_size, l ); + vector_PRECISION_define( &b3, 0, l->num_lattice_site_var*s->num_block_even_sites, s->block_vector_size, l ); - block_hopping_term_PRECISION( b3, b4, 0, _ODD_SITES, s, l, no_threading ); - block_diag_oo_inv_PRECISION( b5, b3, 0, s, l, no_threading ); - vector_PRECISION_plus( b4, b4, b5, l->num_lattice_site_var*s->num_block_even_sites, s->block_vector_size, l ); + block_hopping_term_PRECISION( &b3, &b4, 0, _ODD_SITES, s, l, no_threading ); + block_diag_oo_inv_PRECISION( &b5, &b3, 0, s, l, no_threading ); + vector_PRECISION_plus( &b4, &b4, &b5, l->num_lattice_site_var*s->num_block_even_sites, s->block_vector_size, l ); - apply_block_schur_complement_PRECISION( b3, b4, 0, s, l, no_threading ); - block_diag_oo_PRECISION( b3, b4, 0, s, l, no_threading ); + apply_block_schur_complement_PRECISION( &b3, &b4, 0, s, l, no_threading ); + block_diag_oo_PRECISION( &b3, &b4, 0, s, l, no_threading ); - block_diag_oo_inv_PRECISION( b5, b3, 0, s, l, no_threading ); - block_hopping_term_PRECISION( b3, b5, 0, _EVEN_SITES, s, l, no_threading ); + block_diag_oo_inv_PRECISION( &b5, &b3, 0, s, l, no_threading ); + block_hopping_term_PRECISION( &b3, &b5, 0, _EVEN_SITES, s, l, no_threading ); - vector_PRECISION_minus( b3, b2, b3, 0, s->block_vector_size, l ); - diff = global_norm_PRECISION( b3, 0, s->block_vector_size, l, no_threading ) / global_norm_PRECISION( b2, 0, s->block_vector_size, l, no_threading ); + vector_PRECISION_minus( &b3, &b2, &b3, 0, s->block_vector_size, l ); + diff = global_norm_PRECISION( &b3, 0, s->block_vector_size, l, no_threading ) / global_norm_PRECISION( &b2, 0, s->block_vector_size, l, no_threading ); test0_PRECISION("depth: %d, correctness of block odd even schur complement: %le\n", l->depth, diff ); - FREE( b1, complex_PRECISION, vs ); - FREE( b2, complex_PRECISION, vs ); - FREE( b3, complex_PRECISION, vs ); - FREE( b4, complex_PRECISION, vs ); - FREE( b5, complex_PRECISION, vs ); + FREE( b1.vector_buffer, complex_PRECISION, vs ); + FREE( b2.vector_buffer, complex_PRECISION, vs ); + FREE( b3.vector_buffer, complex_PRECISION, vs ); + FREE( b4.vector_buffer, 
complex_PRECISION, vs ); + FREE( b5.vector_buffer, complex_PRECISION, vs ); END_UNTHREADED_FUNCTION(threading) } @@ -2600,74 +2575,84 @@ void oddeven_PRECISION_test( level_struct *l ) { * - Compare solutions ( Difference should be close to 0 ). *********************************************************************************/ - vector_double d1=NULL, d2=NULL, d3=NULL; - vector_PRECISION f1=NULL, f2=NULL, f3=NULL, f4=NULL, f5=NULL; + vector_double d1, d2, d3; + vector_PRECISION f1, f2, f3, f4, f5; double diff; - MALLOC( d1, complex_double, l->inner_vector_size ); - MALLOC( d2, complex_double, l->inner_vector_size ); - MALLOC( d3, complex_double, l->inner_vector_size ); - MALLOC( f1, complex_PRECISION, l->inner_vector_size ); - MALLOC( f2, complex_PRECISION, l->inner_vector_size ); - MALLOC( f3, complex_PRECISION, l->inner_vector_size ); - MALLOC( f4, complex_PRECISION, l->inner_vector_size ); - MALLOC( f5, complex_PRECISION, l->inner_vector_size ); - - vector_double_define_random( d1, 0, l->inner_vector_size, l ); - serial_to_oddeven_PRECISION( f1, d1, l, no_threading ); + vector_double_init(&d1); + vector_double_init(&d2); + vector_double_init(&d3); + + vector_PRECISION_init(&f1); + vector_PRECISION_init(&f2); + vector_PRECISION_init(&f3); + vector_PRECISION_init(&f4); + vector_PRECISION_init(&f5); + + MALLOC( d1.vector_buffer, complex_double, l->inner_vector_size ); + MALLOC( d2.vector_buffer, complex_double, l->inner_vector_size ); + MALLOC( d3.vector_buffer, complex_double, l->inner_vector_size ); + MALLOC( f1.vector_buffer, complex_PRECISION, l->inner_vector_size ); + MALLOC( f2.vector_buffer, complex_PRECISION, l->inner_vector_size ); + MALLOC( f3.vector_buffer, complex_PRECISION, l->inner_vector_size ); + MALLOC( f4.vector_buffer, complex_PRECISION, l->inner_vector_size ); + MALLOC( f5.vector_buffer, complex_PRECISION, l->inner_vector_size ); + + vector_double_define_random( &d1, 0, l->inner_vector_size, l ); + serial_to_oddeven_PRECISION( &f1, &d1, l, no_threading ); - diag_ee_PRECISION( f2, f1, &(l->oe_op_PRECISION), l, 0, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var ); - diag_oo_PRECISION( f2, f1, &(l->oe_op_PRECISION), l, no_threading ); + diag_ee_PRECISION( &f2, &f1, &(l->oe_op_PRECISION), l, 0, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var ); + diag_oo_PRECISION( &f2, &f1, &(l->oe_op_PRECISION), l, no_threading ); - hopping_term_PRECISION( f2, f1, &(l->oe_op_PRECISION), _FULL_SYSTEM, l, no_threading ); + hopping_term_PRECISION( &f2, &f1, &(l->oe_op_PRECISION), _FULL_SYSTEM, l, no_threading ); - d_plus_clover_double( d2, d1, &(g.op_double), l, no_threading ); - oddeven_to_serial_PRECISION( d1, f2, l, no_threading ); + d_plus_clover_double( &d2, &d1, &(g.op_double), l, no_threading ); + oddeven_to_serial_PRECISION( &d1, &f2, l, no_threading ); - vector_double_minus( d3, d1, d2, 0, l->num_inner_lattice_sites, l ); - diff = global_norm_double( d3, 0, l->num_inner_lattice_sites, l, no_threading )/global_norm_double( d1, 0, l->num_inner_lattice_sites, l, no_threading ); + vector_double_minus( &d3, &d1, &d2, 0, l->num_inner_lattice_sites, l ); + diff = global_norm_double( &d3, 0, l->num_inner_lattice_sites, l, no_threading )/global_norm_double( &d1, 0, l->num_inner_lattice_sites, l, no_threading ); test0_PRECISION("depth: %d, correctness of odd even layout: %le\n", l->depth, diff ); // -------------- - vector_PRECISION_copy( f4, f1, 0, l->inner_vector_size, l ); - diag_oo_PRECISION( f3, f4, &(l->oe_op_PRECISION), l, no_threading ); - diag_oo_inv_PRECISION( f4, 
f3, &(l->oe_op_PRECISION), l, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l->inner_vector_size ); - vector_PRECISION_minus( f4, f4, f1, 0, l->inner_vector_size, l ); + vector_PRECISION_copy( &f4, &f1, 0, l->inner_vector_size, l ); + diag_oo_PRECISION( &f3, &f4, &(l->oe_op_PRECISION), l, no_threading ); + diag_oo_inv_PRECISION( &f4, &f3, &(l->oe_op_PRECISION), l, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l->inner_vector_size ); + vector_PRECISION_minus( &f4, &f4, &f1, 0, l->inner_vector_size, l ); - diff = (PRECISION) (global_norm_PRECISION( f4, 0, l->inner_vector_size, l, no_threading )/global_norm_PRECISION( f1, 0, l->inner_vector_size, l, no_threading )); + diff = (PRECISION) (global_norm_PRECISION( &f4, 0, l->inner_vector_size, l, no_threading )/global_norm_PRECISION( &f1, 0, l->inner_vector_size, l, no_threading )); test0_PRECISION("depth: %d, correctness of odd even diagonal term: %le\n", l->depth, diff ); // transformation part - vector_PRECISION_copy( f4, f1, 0, l->inner_vector_size, l ); + vector_PRECISION_copy( &f4, &f1, 0, l->inner_vector_size, l ); // even to odd // set odd part of f3 to 0. - vector_PRECISION_define( f3, 0, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l->inner_vector_size, l ); + vector_PRECISION_define( &f3, 0, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l->inner_vector_size, l ); - hopping_term_PRECISION( f3, f4, &(l->oe_op_PRECISION), _ODD_SITES, l, no_threading ); - diag_oo_inv_PRECISION( f5, f3, &(l->oe_op_PRECISION), l, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l->inner_vector_size ); - vector_PRECISION_plus( f4, f4, f5, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l->inner_vector_size, l ); + hopping_term_PRECISION( &f3, &f4, &(l->oe_op_PRECISION), _ODD_SITES, l, no_threading ); + diag_oo_inv_PRECISION( &f5, &f3, &(l->oe_op_PRECISION), l, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l->inner_vector_size ); + vector_PRECISION_plus( &f4, &f4, &f5, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l->inner_vector_size, l ); // block diagonal part - apply_schur_complement_PRECISION( f3, f4, &(l->oe_op_PRECISION), l, no_threading ); - diag_oo_PRECISION( f3, f4, &(l->oe_op_PRECISION), l, no_threading ); + apply_schur_complement_PRECISION( &f3, &f4, &(l->oe_op_PRECISION), l, no_threading ); + diag_oo_PRECISION( &f3, &f4, &(l->oe_op_PRECISION), l, no_threading ); // back transformation part - diag_oo_inv_PRECISION( f5, f3, &(l->oe_op_PRECISION), l, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l->inner_vector_size ); - hopping_term_PRECISION( f3, f5, &(l->oe_op_PRECISION), _EVEN_SITES, l, no_threading ); + diag_oo_inv_PRECISION( &f5, &f3, &(l->oe_op_PRECISION), l, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l->inner_vector_size ); + hopping_term_PRECISION( &f3, &f5, &(l->oe_op_PRECISION), _EVEN_SITES, l, no_threading ); - vector_PRECISION_minus( f1, f2, f3, 0, l->inner_vector_size, l ); - diff = (PRECISION) (global_norm_PRECISION( f1, 0, l->inner_vector_size, l, no_threading )/global_norm_PRECISION( f2, 0, l->inner_vector_size, l, no_threading )); + vector_PRECISION_minus( &f1, &f2, &f3, 0, l->inner_vector_size, l ); + diff = (PRECISION) (global_norm_PRECISION( &f1, 0, l->inner_vector_size, l, no_threading )/global_norm_PRECISION( &f2, 0, l->inner_vector_size, l, no_threading )); test0_PRECISION("depth: %d, correctness of odd even schur complement: %le\n", l->depth, diff ); - FREE( d1, complex_double, l->inner_vector_size 
); - FREE( d2, complex_double, l->inner_vector_size ); - FREE( d3, complex_double, l->inner_vector_size ); - FREE( f1, complex_PRECISION, l->inner_vector_size ); - FREE( f2, complex_PRECISION, l->inner_vector_size ); - FREE( f3, complex_PRECISION, l->inner_vector_size ); - FREE( f4, complex_PRECISION, l->inner_vector_size ); - FREE( f5, complex_PRECISION, l->inner_vector_size ); + FREE( d1.vector_buffer, complex_double, l->inner_vector_size ); + FREE( d2.vector_buffer, complex_double, l->inner_vector_size ); + FREE( d3.vector_buffer, complex_double, l->inner_vector_size ); + FREE( f1.vector_buffer, complex_PRECISION, l->inner_vector_size ); + FREE( f2.vector_buffer, complex_PRECISION, l->inner_vector_size ); + FREE( f3.vector_buffer, complex_PRECISION, l->inner_vector_size ); + FREE( f4.vector_buffer, complex_PRECISION, l->inner_vector_size ); + FREE( f5.vector_buffer, complex_PRECISION, l->inner_vector_size ); } diff --git a/src/oddeven_generic.h b/src/oddeven_generic.h index 4fac101..2d03e98 100644 --- a/src/oddeven_generic.h +++ b/src/oddeven_generic.h @@ -24,39 +24,39 @@ struct Thread; - void hopping_term_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operator_PRECISION_struct *op, + void hopping_term_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, operator_PRECISION_struct *op, const int amount, level_struct *l, struct Thread *threading ); void oddeven_setup_PRECISION( operator_double_struct *in, level_struct *l ); void oddeven_free_PRECISION( level_struct *l ); - void oddeven_to_serial_PRECISION( vector_double out, vector_PRECISION in, level_struct *l, struct Thread *threading ); - void serial_to_oddeven_PRECISION( vector_PRECISION out, vector_double in, level_struct *l, struct Thread *threading ); + void oddeven_to_serial_PRECISION( vector_double *out, vector_PRECISION *in, level_struct *l, struct Thread *threading ); + void serial_to_oddeven_PRECISION( vector_PRECISION *out, vector_double *in, level_struct *l, struct Thread *threading ); - void oddeven_to_block_PRECISION( vector_PRECISION out, vector_PRECISION in, level_struct *l, struct Thread *threading ); - void block_to_oddeven_PRECISION( vector_PRECISION out, vector_PRECISION in, level_struct *l, struct Thread *threading ); + void oddeven_to_block_PRECISION( vector_PRECISION *out, vector_PRECISION *in, level_struct *l, struct Thread *threading ); + void block_to_oddeven_PRECISION( vector_PRECISION *out, vector_PRECISION *in, level_struct *l, struct Thread *threading ); - void block_hopping_term_PRECISION( vector_PRECISION eta, vector_PRECISION phi, + void block_hopping_term_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, int start, int amount, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ); - void block_n_hopping_term_PRECISION( vector_PRECISION eta, vector_PRECISION phi, + void block_n_hopping_term_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, int start, int amount, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ); - void block_diag_oo_inv_PRECISION( vector_PRECISION eta, vector_PRECISION phi, + void block_diag_oo_inv_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, int start, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ); - void block_diag_oo_PRECISION( vector_PRECISION eta, vector_PRECISION phi, + void block_diag_oo_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, int start, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ); - void block_diag_ee_PRECISION( vector_PRECISION 
eta, vector_PRECISION phi, + void block_diag_ee_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, int start, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ); - void apply_schur_complement_PRECISION( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); + void apply_schur_complement_PRECISION( vector_PRECISION *out, vector_PRECISION *in, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); void solve_oddeven_PRECISION( gmres_PRECISION_struct *p, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); - void g5D_apply_schur_complement_PRECISION( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); + void g5D_apply_schur_complement_PRECISION( vector_PRECISION *out, vector_PRECISION *in, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); void g5D_solve_oddeven_PRECISION( gmres_PRECISION_struct *p, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); void schwarz_PRECISION_oddeven_setup( schwarz_PRECISION_struct *s, level_struct *l ); - void apply_block_schur_complement_PRECISION( vector_PRECISION out, vector_PRECISION in, int start, + void apply_block_schur_complement_PRECISION( vector_PRECISION *out, vector_PRECISION *in, int start, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ); - void block_solve_oddeven_PRECISION( vector_PRECISION phi, vector_PRECISION r, vector_PRECISION latest_iter, + void block_solve_oddeven_PRECISION( vector_PRECISION *phi, vector_PRECISION *r, vector_PRECISION *latest_iter, int start, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ); void oddeven_PRECISION_test( level_struct *l ); diff --git a/src/operator_generic.c b/src/operator_generic.c index f881b23..408c83f 100644 --- a/src/operator_generic.c +++ b/src/operator_generic.c @@ -58,7 +58,7 @@ void operator_PRECISION_init( operator_PRECISION_struct *op ) { for ( int i=0; i<8; i++ ) { op->c.boundary_table[i] = NULL; - op->c.buffer[i] = NULL; + vector_PRECISION_init(&(op->c.buffer[i])); op->c.in_use[i] = 0; } op->c.comm = 1; @@ -393,41 +393,45 @@ void operator_PRECISION_test_routine( operator_PRECISION_struct *op, level_struc int ivs = l->inner_vector_size; double diff; - vector_double vd1=NULL, vd2, vd3, vd4; - vector_PRECISION vp1=NULL, vp2; + vector_double vd1, vd2, vd3, vd4; + vector_PRECISION vp1, vp2; - PUBLIC_MALLOC( vd1, complex_double, 4*ivs ); - PUBLIC_MALLOC( vp1, complex_PRECISION, 2*ivs ); + vector_double_init(&vd1); + vector_PRECISION_init(&vp1); - vd2 = vd1 + ivs; vd3 = vd2 + ivs; vd4 = vd3 + ivs; vp2 = vp1 + ivs; + PUBLIC_MALLOC( vd1.vector_buffer, complex_double, 4*ivs ); + PUBLIC_MALLOC( vp1.vector_buffer, complex_PRECISION, 2*ivs ); + + vd2.vector_buffer = vd1.vector_buffer + ivs; vd3.vector_buffer = vd2.vector_buffer + ivs; + vd4.vector_buffer = vd3.vector_buffer + ivs; vp2.vector_buffer = vp1.vector_buffer + ivs; START_LOCKED_MASTER(threading) - vector_double_define_random( vd1, 0, l->inner_vector_size, l ); - apply_operator_double( vd2, vd1, &(g.p), l, no_threading ); + vector_double_define_random( &vd1, 0, l->inner_vector_size, l ); + apply_operator_double( &vd2, &vd1, &(g.p), l, no_threading ); - trans_PRECISION( vp1, vd1, op->translation_table, l, no_threading ); - apply_operator_PRECISION( vp2, vp1, &(l->p_PRECISION), l, no_threading ); - trans_back_PRECISION( vd3, vp2, op->translation_table, l, 
no_threading ); + trans_PRECISION( &vp1, &vd1, op->translation_table, l, no_threading ); + apply_operator_PRECISION( &vp2, &vp1, &(l->p_PRECISION), l, no_threading ); + trans_back_PRECISION( &vd3, &vp2, op->translation_table, l, no_threading ); - vector_double_minus( vd4, vd3, vd2, 0, l->inner_vector_size, l ); - diff = global_norm_double( vd4, 0, ivs, l, no_threading )/ - global_norm_double( vd3, 0, ivs, l, no_threading ); + vector_double_minus( &vd4, &vd3, &vd2, 0, l->inner_vector_size, l ); + diff = global_norm_double( &vd4, 0, ivs, l, no_threading )/ + global_norm_double( &vd3, 0, ivs, l, no_threading ); test0_PRECISION("depth: %d, correctness of schwarz PRECISION Dirac operator: %le\n", l->depth, diff ); END_LOCKED_MASTER(threading) if(threading->n_core > 1) { - apply_operator_PRECISION( vp2, vp1, &(l->p_PRECISION), l, threading ); + apply_operator_PRECISION( &vp2, &vp1, &(l->p_PRECISION), l, threading ); SYNC_MASTER_TO_ALL(threading) SYNC_CORES(threading) START_LOCKED_MASTER(threading) - trans_back_PRECISION( vd3, vp2, op->translation_table, l, no_threading ); - vector_double_minus( vd4, vd3, vd2, 0, l->inner_vector_size, l ); - diff = global_norm_double( vd4, 0, ivs, l, no_threading ) / - global_norm_double( vd3, 0, ivs, l, no_threading ); + trans_back_PRECISION( &vd3, &vp2, op->translation_table, l, no_threading ); + vector_double_minus( &vd4, &vd3, &vd2, 0, l->inner_vector_size, l ); + diff = global_norm_double( &vd4, 0, ivs, l, no_threading ) / + global_norm_double( &vd3, 0, ivs, l, no_threading ); if ( diff > EPS_PRECISION ) printf0("\x1b[31m"); @@ -439,8 +443,8 @@ void operator_PRECISION_test_routine( operator_PRECISION_struct *op, level_struc END_LOCKED_MASTER(threading) } - PUBLIC_FREE( vd1, complex_double, 4*ivs ); - PUBLIC_FREE( vp1, complex_PRECISION, 2*ivs ); + PUBLIC_FREE( vd1.vector_buffer, complex_double, 4*ivs ); + PUBLIC_FREE( vp1.vector_buffer, complex_PRECISION, 2*ivs ); START_LOCKED_MASTER(threading) if ( g.method >=4 && g.odd_even ) diff --git a/src/preconditioner.c b/src/preconditioner.c index d7065d4..bd2a401 100644 --- a/src/preconditioner.c +++ b/src/preconditioner.c @@ -22,19 +22,19 @@ #include "main.h" #include "preconditioner.h" -void preconditioner( vector_double phi, vector_double Dphi, vector_double eta, +void preconditioner( vector_double *phi, vector_double *Dphi, vector_double *eta, const int res, level_struct *l, struct Thread *threading ) { if ( g.method == 0 ) vector_double_copy( phi, eta, threading->start_index[l->depth], threading->end_index[l->depth], l ); else if ( g.method < 5 || g.method == 6 || !g.odd_even ) { if ( g.mixed_precision ) { - trans_float( l->sbuf_float[0], eta, l->s_float.op.translation_table, l, threading ); - vcycle_float( l->sbuf_float[1], NULL, l->sbuf_float[0], res, l, threading ); - trans_back_float( phi, l->sbuf_float[1], l->s_float.op.translation_table, l, threading ); + trans_float( &(l->sbuf_float[0]), eta, l->s_float.op.translation_table, l, threading ); + vcycle_float( &(l->sbuf_float[1]), NULL, &(l->sbuf_float[0]), res, l, threading ); + trans_back_float( phi, &(l->sbuf_float[1]), l->s_float.op.translation_table, l, threading ); } else { - trans_double( l->sbuf_double[0], eta, l->s_double.op.translation_table, l, threading ); - vcycle_double( l->sbuf_double[1], NULL, l->sbuf_double[0], res, l, threading ); - trans_back_double( phi, l->sbuf_double[1], l->s_double.op.translation_table, l, threading ); + trans_double( &(l->sbuf_double[0]), eta, l->s_double.op.translation_table, l, threading ); + vcycle_double( 
&(l->sbuf_double[1]), NULL, &(l->sbuf_double[0]), res, l, threading ); + trans_back_double( phi, &(l->sbuf_double[1]), l->s_double.op.translation_table, l, threading ); } } else { if ( g.mixed_precision ) { @@ -42,25 +42,25 @@ void preconditioner( vector_double phi, vector_double Dphi, vector_double eta, l->sp_float.num_restart = l->n_cy; l->sp_float.initial_guess_zero = res; END_LOCKED_MASTER(threading) - serial_to_oddeven_float( l->sp_float.b, eta, l, threading ); + serial_to_oddeven_float( &(l->sp_float.b), eta, l, threading ); if ( g.method == 6 ) { g5D_solve_oddeven_float( &(l->sp_float), &(l->oe_op_float), l, threading ); } else { solve_oddeven_float( &(l->sp_float), &(l->oe_op_float), l, threading ); } - oddeven_to_serial_float( phi, l->sp_float.x, l, threading ); + oddeven_to_serial_float( phi, &(l->sp_float.x), l, threading ); } else { START_LOCKED_MASTER(threading) l->sp_double.num_restart = l->n_cy; l->sp_double.initial_guess_zero = res; END_LOCKED_MASTER(threading) - serial_to_oddeven_double( l->sp_double.b, eta, l, threading ); + serial_to_oddeven_double( &(l->sp_double.b), eta, l, threading ); if ( g.method == 6 ) { g5D_solve_oddeven_double( &(l->sp_double), &(l->oe_op_double), l, threading ); } else { solve_oddeven_double( &(l->sp_double), &(l->oe_op_double), l, threading ); } - oddeven_to_serial_double( phi, l->sp_double.x, l, threading ); + oddeven_to_serial_double( phi, &(l->sp_double.x), l, threading ); } } diff --git a/src/preconditioner.h b/src/preconditioner.h index 783c70c..d3f0b02 100644 --- a/src/preconditioner.h +++ b/src/preconditioner.h @@ -29,6 +29,6 @@ #include "schwarz_float.h" #include "schwarz_double.h" - void preconditioner( vector_double phi, vector_double Dphi, vector_double eta, + void preconditioner( vector_double *phi, vector_double *Dphi, vector_double *eta, const int res, level_struct *l, struct Thread *threading ); #endif diff --git a/src/schwarz_generic.c b/src/schwarz_generic.c index 01becd3..ec64fa5 100644 --- a/src/schwarz_generic.c +++ b/src/schwarz_generic.c @@ -51,16 +51,16 @@ void schwarz_PRECISION_init( schwarz_PRECISION_struct *s, level_struct *l ) { s->index[T] = NULL; s->oe_index[T] = NULL; s->block = NULL; - s->buf1 = NULL; - s->buf2 = NULL; - s->buf3 = NULL; - s->buf4 = NULL; - s->buf5 = NULL; - l->sbuf_PRECISION[0] = NULL; - s->oe_buf[0] = NULL; - s->oe_buf[1] = NULL; - s->oe_buf[2] = NULL; - s->oe_buf[3] = NULL; + vector_PRECISION_init(&(s->buf1)); + vector_PRECISION_init(&(s->buf2)); + vector_PRECISION_init(&(s->buf3)); + vector_PRECISION_init(&(s->buf4)); + vector_PRECISION_init(&(s->buf5)); + vector_PRECISION_init(&(l->sbuf_PRECISION[0])); + vector_PRECISION_init(&(s->oe_buf[0])); + vector_PRECISION_init(&(s->oe_buf[1])); + vector_PRECISION_init(&(s->oe_buf[2])); + vector_PRECISION_init(&(s->oe_buf[3])); s->local_minres_buffer[0] = NULL; s->local_minres_buffer[1] = NULL; s->local_minres_buffer[2] = NULL; @@ -148,10 +148,11 @@ void schwarz_PRECISION_alloc( schwarz_PRECISION_struct *s, level_struct *l ) { #endif if ( l->depth == 0 ) { - MALLOC( s->oe_buf[0], complex_PRECISION, 4*vs ); - s->oe_buf[1] = s->oe_buf[0] + vs; - s->oe_buf[2] = s->oe_buf[1] + vs; - s->oe_buf[3] = s->oe_buf[2] + vs; + vector_PRECISION_init(&(s->oe_buf[0])); + MALLOC( s->oe_buf[0].vector_buffer, complex_PRECISION, 4*vs ); + s->oe_buf[1].vector_buffer = s->oe_buf[0].vector_buffer + vs; + s->oe_buf[2].vector_buffer = s->oe_buf[1].vector_buffer + vs; + s->oe_buf[3].vector_buffer = s->oe_buf[2].vector_buffer + vs; } n = 0; @@ -172,17 +173,19 @@ void 
schwarz_PRECISION_alloc( schwarz_PRECISION_struct *s, level_struct *l ) { s->block[i].bt = NULL; MALLOC( s->block[i].bt, int, n ); } - - MALLOC( s->buf1, complex_PRECISION, vs+3*svs ); - s->buf2 = s->buf1 + vs; - s->buf3 = s->buf2 + svs; - s->buf4 = s->buf3 + svs; + vector_PRECISION_init(&(s->buf1)); + MALLOC( s->buf1.vector_buffer, complex_PRECISION, vs+3*svs ); + s->buf2.vector_buffer = s->buf1.vector_buffer + vs; + s->buf3.vector_buffer = s->buf2.vector_buffer + svs; + s->buf4.vector_buffer = s->buf3.vector_buffer + svs; - if ( g.method == 1 ) - MALLOC( s->buf5, complex_PRECISION, svs ); - - MALLOC( l->sbuf_PRECISION[0], complex_PRECISION, 2*vs ); - l->sbuf_PRECISION[1] = l->sbuf_PRECISION[0] + vs; + if ( g.method == 1 ){ + vector_PRECISION_init(&(s->buf5)); + MALLOC( s->buf5.vector_buffer, complex_PRECISION, svs ); + } + vector_PRECISION_init(&(l->sbuf_PRECISION[0])); + MALLOC( l->sbuf_PRECISION[0].vector_buffer, complex_PRECISION, 2*vs ); + l->sbuf_PRECISION[1].vector_buffer = l->sbuf_PRECISION[0].vector_buffer + vs; // these buffers are introduced to make local_minres_PRECISION thread-safe MALLOC( s->local_minres_buffer[0], complex_PRECISION, svs ); @@ -259,25 +262,26 @@ void schwarz_PRECISION_free( schwarz_PRECISION_struct *s, level_struct *l ) { vs *= 2; #endif if ( l->depth == 0 ) { - s->oe_buf[1] = NULL; - s->oe_buf[2] = NULL; - s->oe_buf[3] = NULL; - FREE( s->oe_buf[0], complex_PRECISION, 4*vs ); - s->oe_buf[0] = NULL; + vector_PRECISION_init(&(s->oe_buf[1])); + vector_PRECISION_init(&(s->oe_buf[2])); + vector_PRECISION_init(&(s->oe_buf[3])); + FREE( s->oe_buf[0].vector_buffer, complex_PRECISION, 4*vs ); + vector_PRECISION_init(&(s->oe_buf[0])); } - FREE( s->buf1, complex_PRECISION, vs+3*svs ); - s->buf2 = NULL; s->buf3 = NULL; - s->buf4 = NULL; + FREE( s->buf1.vector_buffer, complex_PRECISION, vs+3*svs ); + vector_PRECISION_init(&(s->buf2)); + vector_PRECISION_init(&(s->buf3)); + vector_PRECISION_init(&(s->buf4)); if ( g.method == 1 ) - FREE( s->buf5, complex_PRECISION, svs ); + FREE( s->buf5.vector_buffer, complex_PRECISION, svs ); operator_PRECISION_free( &(s->op), _SCHWARZ, l ); - FREE( l->sbuf_PRECISION[0], complex_PRECISION, 2*vs ); - l->sbuf_PRECISION[1] = NULL; + FREE( l->sbuf_PRECISION[0].vector_buffer, complex_PRECISION, 2*vs ); + vector_PRECISION_init(&(l->sbuf_PRECISION[1])); FREE( s->local_minres_buffer[0], complex_PRECISION, svs ); FREE( s->local_minres_buffer[1], complex_PRECISION, svs ); @@ -649,7 +653,7 @@ void schwarz_PRECISION_boundary_update( schwarz_PRECISION_struct *s, level_struc int i, t, z, y, x, mu, nu, index, *it = s->op.index_table, *dt = s->op.table_dim, ls[4], le[4], buf_length[4], link_size; - vector_PRECISION buf[4] = {NULL,NULL,NULL,NULL}, rbuf[4] = {NULL,NULL,NULL,NULL}; + buffer_PRECISION buf[4] = {NULL,NULL,NULL,NULL}, rbuf[4] = {NULL,NULL,NULL,NULL}; config_PRECISION D=s->op.D; for ( mu=0; mu<4; mu++ ) { @@ -694,7 +698,7 @@ void schwarz_PRECISION_boundary_update( schwarz_PRECISION_struct *s, level_struc for ( y=ls[Y]; yneighbor_rank[2*mu], @@ -715,7 +719,7 @@ void schwarz_PRECISION_boundary_update( schwarz_PRECISION_struct *s, level_struc for ( y=ls[Y]; yblock_boundary_length; @@ -743,15 +747,15 @@ void block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, in PRECISION *Dminus = s->op.D_transformed_vectorized; for ( int mu=0; mu<4; mu++ ) { - boundary_plus_coupling_PRECISION( (PRECISION*)eta, Dplus, (PRECISION*)phi, + boundary_plus_coupling_PRECISION( (PRECISION*)eta->vector_buffer, Dplus, (PRECISION*)phi->vector_buffer, 
mu, bbl[2*mu], bbl[2*mu+1], s->block[k].bt, NULL ); - boundary_minus_coupling_PRECISION( (PRECISION*)eta, Dminus, (PRECISION*)phi, + boundary_minus_coupling_PRECISION( (PRECISION*)eta->vector_buffer, Dminus, (PRECISION*)phi->vector_buffer, mu, bbl[2*mu+1], bbl[2*mu+2], s->block[k].bt, NULL ); } #else int i, mu, index, neighbor_index; config_PRECISION D_pt, D = s->op.D; - vector_PRECISION phi_pt, eta_pt; + buffer_PRECISION phi_pt, eta_pt; #ifdef HAVE_TM1p1 if( g.n_flavours == 2 ) { @@ -762,8 +766,8 @@ void block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, in index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*index + 9*mu; - phi_pt = phi + 24*neighbor_index; - eta_pt = eta + 24*index; + phi_pt = phi->vector_buffer + 24*neighbor_index; + eta_pt = eta->vector_buffer + 24*index; dprp_T_PRECISION( buf1, phi_pt ); mvm_PRECISION( buf2, D_pt, buf1 ); mvm_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -776,8 +780,8 @@ void block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, in index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*neighbor_index + 9*mu; - phi_pt = phi + 24*neighbor_index; - eta_pt = eta + 24*index; + phi_pt = phi->vector_buffer + 24*neighbor_index; + eta_pt = eta->vector_buffer + 24*index; dprn_T_PRECISION( buf1, phi_pt ); mvmh_PRECISION( buf2, D_pt, buf1 ); mvmh_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -792,8 +796,8 @@ void block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, in index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*index + 9*mu; - phi_pt = phi + 24*neighbor_index; - eta_pt = eta + 24*index; + phi_pt = phi->vector_buffer + 24*neighbor_index; + eta_pt = eta->vector_buffer + 24*index; dprp_Z_PRECISION( buf1, phi_pt ); mvm_PRECISION( buf2, D_pt, buf1 ); mvm_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -806,8 +810,8 @@ void block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, in index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*neighbor_index + 9*mu; - phi_pt = phi + 24*neighbor_index; - eta_pt = eta + 24*index; + phi_pt = phi->vector_buffer + 24*neighbor_index; + eta_pt = eta->vector_buffer + 24*index; dprn_Z_PRECISION( buf1, phi_pt ); mvmh_PRECISION( buf2, D_pt, buf1 ); mvmh_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -822,8 +826,8 @@ void block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, in index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*index + 9*mu; - phi_pt = phi + 24*neighbor_index; - eta_pt = eta + 24*index; + phi_pt = phi->vector_buffer + 24*neighbor_index; + eta_pt = eta->vector_buffer + 24*index; dprp_Y_PRECISION( buf1, phi_pt ); mvm_PRECISION( buf2, D_pt, buf1 ); mvm_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -836,8 +840,8 @@ void block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, in index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*neighbor_index + 9*mu; - phi_pt = phi + 24*neighbor_index; - eta_pt = eta + 24*index; + phi_pt = phi->vector_buffer + 24*neighbor_index; + eta_pt = eta->vector_buffer + 24*index; dprn_Y_PRECISION( buf1, phi_pt ); mvmh_PRECISION( buf2, D_pt, buf1 ); mvmh_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -852,8 +856,8 @@ void block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, in index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*index + 9*mu; - phi_pt = phi + 24*neighbor_index; - eta_pt = eta + 24*index; + phi_pt = phi->vector_buffer 
+ 24*neighbor_index; + eta_pt = eta->vector_buffer + 24*index; dprp_X_PRECISION( buf1, phi_pt ); mvm_PRECISION( buf2, D_pt, buf1 ); mvm_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -866,8 +870,8 @@ void block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, in index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*neighbor_index + 9*mu; - phi_pt = phi + 24*neighbor_index; - eta_pt = eta + 24*index; + phi_pt = phi->vector_buffer + 24*neighbor_index; + eta_pt = eta->vector_buffer + 24*index; dprn_X_PRECISION( buf1, phi_pt ); mvmh_PRECISION( buf2, D_pt, buf1 ); mvmh_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -884,8 +888,8 @@ void block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, in index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*index + 9*mu; - phi_pt = phi + 12*neighbor_index; - eta_pt = eta + 12*index; + phi_pt = phi->vector_buffer + 12*neighbor_index; + eta_pt = eta->vector_buffer + 12*index; prp_T_PRECISION( buf1, phi_pt ); mvm_PRECISION( buf2, D_pt, buf1 ); mvm_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -896,8 +900,8 @@ void block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, in index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*neighbor_index + 9*mu; - phi_pt = phi + 12*neighbor_index; - eta_pt = eta + 12*index; + phi_pt = phi->vector_buffer + 12*neighbor_index; + eta_pt = eta->vector_buffer + 12*index; prn_T_PRECISION( buf1, phi_pt ); mvmh_PRECISION( buf2, D_pt, buf1 ); mvmh_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -910,8 +914,8 @@ void block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, in index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*index + 9*mu; - phi_pt = phi + 12*neighbor_index; - eta_pt = eta + 12*index; + phi_pt = phi->vector_buffer + 12*neighbor_index; + eta_pt = eta->vector_buffer + 12*index; prp_Z_PRECISION( buf1, phi_pt ); mvm_PRECISION( buf2, D_pt, buf1 ); mvm_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -922,8 +926,8 @@ void block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, in index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*neighbor_index + 9*mu; - phi_pt = phi + 12*neighbor_index; - eta_pt = eta + 12*index; + phi_pt = phi->vector_buffer + 12*neighbor_index; + eta_pt = eta->vector_buffer + 12*index; prn_Z_PRECISION( buf1, phi_pt ); mvmh_PRECISION( buf2, D_pt, buf1 ); mvmh_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -936,8 +940,8 @@ void block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, in index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*index + 9*mu; - phi_pt = phi + 12*neighbor_index; - eta_pt = eta + 12*index; + phi_pt = phi->vector_buffer + 12*neighbor_index; + eta_pt = eta->vector_buffer + 12*index; prp_Y_PRECISION( buf1, phi_pt ); mvm_PRECISION( buf2, D_pt, buf1 ); mvm_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -948,8 +952,8 @@ void block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, in index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*neighbor_index + 9*mu; - phi_pt = phi + 12*neighbor_index; - eta_pt = eta + 12*index; + phi_pt = phi->vector_buffer + 12*neighbor_index; + eta_pt = eta->vector_buffer + 12*index; prn_Y_PRECISION( buf1, phi_pt ); mvmh_PRECISION( buf2, D_pt, buf1 ); mvmh_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -962,8 +966,8 @@ void block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, in index = s->block[k].bt[i]; 
neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*index + 9*mu; - phi_pt = phi + 12*neighbor_index; - eta_pt = eta + 12*index; + phi_pt = phi->vector_buffer + 12*neighbor_index; + eta_pt = eta->vector_buffer + 12*index; prp_X_PRECISION( buf1, phi_pt ); mvm_PRECISION( buf2, D_pt, buf1 ); mvm_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -974,8 +978,8 @@ void block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, in index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*neighbor_index + 9*mu; - phi_pt = phi + 12*neighbor_index; - eta_pt = eta + 12*index; + phi_pt = phi->vector_buffer + 12*neighbor_index; + eta_pt = eta->vector_buffer + 12*index; prn_X_PRECISION( buf1, phi_pt ); mvmh_PRECISION( buf2, D_pt, buf1 ); mvmh_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -988,7 +992,7 @@ void block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, in } -void n_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, int k, +void n_block_PRECISION_boundary_op( vector_PRECISION *eta, vector_PRECISION *phi, int k, schwarz_PRECISION_struct *s, level_struct *l ) { // k: number of current block int *bbl = s->block_boundary_length; @@ -997,15 +1001,15 @@ void n_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, PRECISION *Dminus = s->op.D_transformed_vectorized; for ( int mu=0; mu<4; mu++ ) { - boundary_nplus_coupling_PRECISION( (PRECISION*)eta, Dplus, (PRECISION*)phi, + boundary_nplus_coupling_PRECISION( (PRECISION*)eta->vector_buffer, Dplus, (PRECISION*)phi->vector_buffer, mu, bbl[2*mu], bbl[2*mu+1], s->block[k].bt, NULL ); - boundary_nminus_coupling_PRECISION( (PRECISION*)eta, Dminus, (PRECISION*)phi, + boundary_nminus_coupling_PRECISION( (PRECISION*)eta->vector_buffer, Dminus, (PRECISION*)phi->vector_buffer, mu, bbl[2*mu+1], bbl[2*mu+2], s->block[k].bt, NULL ); } #else int i, mu, index, neighbor_index; config_PRECISION D_pt, D = s->op.D; - vector_PRECISION phi_pt, eta_pt; + buffer_PRECISION phi_pt, eta_pt; #ifdef HAVE_TM1p1 if( g.n_flavours == 2 ) { @@ -1016,8 +1020,8 @@ void n_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*index + 9*mu; - phi_pt = phi + 24*neighbor_index; - eta_pt = eta + 24*index; + phi_pt = phi->vector_buffer + 24*neighbor_index; + eta_pt = eta->vector_buffer + 24*index; dprp_T_PRECISION( buf1, phi_pt ); nmvm_PRECISION( buf2, D_pt, buf1 ); nmvm_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -1030,8 +1034,8 @@ void n_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*neighbor_index + 9*mu; - phi_pt = phi + 24*neighbor_index; - eta_pt = eta + 24*index; + phi_pt = phi->vector_buffer + 24*neighbor_index; + eta_pt = eta->vector_buffer + 24*index; dprn_T_PRECISION( buf1, phi_pt ); nmvmh_PRECISION( buf2, D_pt, buf1 ); nmvmh_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -1046,8 +1050,8 @@ void n_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*index + 9*mu; - phi_pt = phi + 24*neighbor_index; - eta_pt = eta + 24*index; + phi_pt = phi->vector_buffer + 24*neighbor_index; + eta_pt = eta->vector_buffer + 24*index; dprp_Z_PRECISION( buf1, phi_pt ); nmvm_PRECISION( buf2, D_pt, buf1 ); nmvm_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -1060,8 +1064,8 @@ void n_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, 
index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*neighbor_index + 9*mu; - phi_pt = phi + 24*neighbor_index; - eta_pt = eta + 24*index; + phi_pt = phi->vector_buffer + 24*neighbor_index; + eta_pt = eta->vector_buffer + 24*index; dprn_Z_PRECISION( buf1, phi_pt ); nmvmh_PRECISION( buf2, D_pt, buf1 ); nmvmh_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -1076,8 +1080,8 @@ void n_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*index + 9*mu; - phi_pt = phi + 24*neighbor_index; - eta_pt = eta + 24*index; + phi_pt = phi->vector_buffer + 24*neighbor_index; + eta_pt = eta->vector_buffer + 24*index; dprp_Y_PRECISION( buf1, phi_pt ); nmvm_PRECISION( buf2, D_pt, buf1 ); nmvm_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -1090,8 +1094,8 @@ void n_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*neighbor_index + 9*mu; - phi_pt = phi + 24*neighbor_index; - eta_pt = eta + 24*index; + phi_pt = phi->vector_buffer + 24*neighbor_index; + eta_pt = eta->vector_buffer + 24*index; dprn_Y_PRECISION( buf1, phi_pt ); nmvmh_PRECISION( buf2, D_pt, buf1 ); nmvmh_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -1106,8 +1110,8 @@ void n_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*index + 9*mu; - phi_pt = phi + 24*neighbor_index; - eta_pt = eta + 24*index; + phi_pt = phi->vector_buffer + 24*neighbor_index; + eta_pt = eta->vector_buffer + 24*index; dprp_X_PRECISION( buf1, phi_pt ); nmvm_PRECISION( buf2, D_pt, buf1 ); nmvm_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -1120,8 +1124,8 @@ void n_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*neighbor_index + 9*mu; - phi_pt = phi + 24*neighbor_index; - eta_pt = eta + 24*index; + phi_pt = phi->vector_buffer + 24*neighbor_index; + eta_pt = eta->vector_buffer + 24*index; dprn_X_PRECISION( buf1, phi_pt ); nmvmh_PRECISION( buf2, D_pt, buf1 ); nmvmh_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -1138,8 +1142,8 @@ void n_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*index + 9*mu; - phi_pt = phi + 12*neighbor_index; - eta_pt = eta + 12*index; + phi_pt = phi->vector_buffer + 12*neighbor_index; + eta_pt = eta->vector_buffer + 12*index; prp_T_PRECISION( buf1, phi_pt ); nmvm_PRECISION( buf2, D_pt, buf1 ); nmvm_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -1150,8 +1154,8 @@ void n_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*neighbor_index + 9*mu; - phi_pt = phi + 12*neighbor_index; - eta_pt = eta + 12*index; + phi_pt = phi->vector_buffer + 12*neighbor_index; + eta_pt = eta->vector_buffer + 12*index; prn_T_PRECISION( buf1, phi_pt ); nmvmh_PRECISION( buf2, D_pt, buf1 ); nmvmh_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -1164,8 +1168,8 @@ void n_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*index + 9*mu; - phi_pt = phi + 12*neighbor_index; - eta_pt = eta + 12*index; + phi_pt = phi->vector_buffer + 12*neighbor_index; + eta_pt = eta->vector_buffer + 12*index; prp_Z_PRECISION( buf1, phi_pt ); 
nmvm_PRECISION( buf2, D_pt, buf1 ); nmvm_PRECISION( buf2+3, D_pt, buf1+3 );
@@ -1176,8 +1180,8 @@ void n_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi,
index = s->block[k].bt[i];
neighbor_index = s->block[k].bt[i+1];
D_pt = D + 36*neighbor_index + 9*mu;
- phi_pt = phi + 12*neighbor_index;
- eta_pt = eta + 12*index;
+ phi_pt = phi->vector_buffer + 12*neighbor_index;
+ eta_pt = eta->vector_buffer + 12*index;
prn_Z_PRECISION( buf1, phi_pt );
nmvmh_PRECISION( buf2, D_pt, buf1 );
nmvmh_PRECISION( buf2+3, D_pt, buf1+3 );
@@ -1190,8 +1194,8 @@ void n_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi,
index = s->block[k].bt[i];
neighbor_index = s->block[k].bt[i+1];
D_pt = D + 36*index + 9*mu;
- phi_pt = phi + 12*neighbor_index;
- eta_pt = eta + 12*index;
+ phi_pt = phi->vector_buffer + 12*neighbor_index;
+ eta_pt = eta->vector_buffer + 12*index;
prp_Y_PRECISION( buf1, phi_pt );
nmvm_PRECISION( buf2, D_pt, buf1 );
nmvm_PRECISION( buf2+3, D_pt, buf1+3 );
@@ -1202,8 +1206,8 @@ void n_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi,
index = s->block[k].bt[i];
neighbor_index = s->block[k].bt[i+1];
D_pt = D + 36*neighbor_index + 9*mu;
- phi_pt = phi + 12*neighbor_index;
- eta_pt = eta + 12*index;
+ phi_pt = phi->vector_buffer + 12*neighbor_index;
+ eta_pt = eta->vector_buffer + 12*index;
prn_Y_PRECISION( buf1, phi_pt );
nmvmh_PRECISION( buf2, D_pt, buf1 );
nmvmh_PRECISION( buf2+3, D_pt, buf1+3 );
@@ -1216,8 +1220,8 @@ void n_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi,
index = s->block[k].bt[i];
neighbor_index = s->block[k].bt[i+1];
D_pt = D + 36*index + 9*mu;
- phi_pt = phi + 12*neighbor_index;
- eta_pt = eta + 12*index;
+ phi_pt = phi->vector_buffer + 12*neighbor_index;
+ eta_pt = eta->vector_buffer + 12*index;
prp_X_PRECISION( buf1, phi_pt );
nmvm_PRECISION( buf2, D_pt, buf1 );
nmvm_PRECISION( buf2+3, D_pt, buf1+3 );
@@ -1228,8 +1232,8 @@ void n_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi,
index = s->block[k].bt[i];
neighbor_index = s->block[k].bt[i+1];
D_pt = D + 36*neighbor_index + 9*mu;
- phi_pt = phi + 12*neighbor_index;
- eta_pt = eta + 12*index;
+ phi_pt = phi->vector_buffer + 12*neighbor_index;
+ eta_pt = eta->vector_buffer + 12*index;
prn_X_PRECISION( buf1, phi_pt );
nmvmh_PRECISION( buf2, D_pt, buf1 );
nmvmh_PRECISION( buf2+3, D_pt, buf1+3 );
@@ -1242,7 +1246,7 @@ void n_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi,
}
-void coarse_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi,
+void coarse_block_PRECISION_boundary_op( vector_PRECISION *eta, vector_PRECISION *phi,
int k, schwarz_PRECISION_struct *s, level_struct *l ) { // k: number of current block
int *bbl = s->block_boundary_length, n = l->num_lattice_site_var;
@@ -1258,17 +1262,19 @@ void coarse_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION
for ( int i=bbl[2*mu]; i<bbl[2*mu+1]; i+=2 ) {
int index = s->block[k].bt[i];
int neighbor_index = s->block[k].bt[i+1];
- vector_PRECISION phi_pt = phi + n*neighbor_index;
- vector_PRECISION eta_pt = eta + n*index;
- coarse_hopp_PRECISION_vectorized( eta_pt, phi_pt, Dplus + 4*vectorized_link_offset*index, l );
+ vector_PRECISION phi_pt, eta_pt;
+ phi_pt.vector_buffer = phi->vector_buffer + n*neighbor_index;
+ eta_pt.vector_buffer = eta->vector_buffer + n*index;
+ coarse_hopp_PRECISION_vectorized( &eta_pt, &phi_pt, Dplus + 4*vectorized_link_offset*index, l );
}
// minus mu direction
for ( int i=bbl[2*mu+1]; i<bbl[2*mu+2]; i+=2 ) {
int index = s->block[k].bt[i];
int neighbor_index = s->block[k].bt[i+1];
- vector_PRECISION phi_pt = phi + n*neighbor_index;
- vector_PRECISION eta_pt = eta + n*index;
- coarse_hopp_PRECISION_vectorized( eta_pt, phi_pt, Dminus + 4*vectorized_link_offset*neighbor_index, l );
+ vector_PRECISION phi_pt, eta_pt;
+ phi_pt.vector_buffer = phi->vector_buffer + n*neighbor_index;
+ eta_pt.vector_buffer = eta->vector_buffer + n*index;
+ coarse_hopp_PRECISION_vectorized( &eta_pt, &phi_pt, Dminus + 4*vectorized_link_offset*neighbor_index, l );
}
}
#else
@@ -1280,26 +1286,28 @@ void coarse_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION
for ( int i=bbl[2*mu]; i<bbl[2*mu+1]; i+=2 ) {
int index = s->block[k].bt[i];
int neighbor_index = s->block[k].bt[i+1];
- vector_PRECISION phi_pt = phi + n*neighbor_index;
- vector_PRECISION eta_pt = eta + n*index;
+ vector_PRECISION phi_pt, eta_pt;
+ phi_pt.vector_buffer = phi->vector_buffer + n*neighbor_index;
+ eta_pt.vector_buffer = eta->vector_buffer + n*index;
config_PRECISION D_pt = D + site_size*index + link_size*mu;
- coarse_hopp_PRECISION( eta_pt, phi_pt, D_pt, l );
+ coarse_hopp_PRECISION( &eta_pt, &phi_pt, D_pt, l );
}
// minus mu direction
for ( int i=bbl[2*mu+1]; i<bbl[2*mu+2]; i+=2 ) {
int index = s->block[k].bt[i];
int neighbor_index = s->block[k].bt[i+1];
- vector_PRECISION phi_pt = phi + n*neighbor_index;
- vector_PRECISION eta_pt = eta + n*index;
+ vector_PRECISION phi_pt, eta_pt;
+ phi_pt.vector_buffer = phi->vector_buffer + n*neighbor_index;
+ eta_pt.vector_buffer = eta->vector_buffer + n*index;
config_PRECISION D_pt = D + site_size*neighbor_index + link_size*mu;
- coarse_daggered_hopp_PRECISION( eta_pt, phi_pt, D_pt, l );
+ coarse_daggered_hopp_PRECISION( &eta_pt, &phi_pt, D_pt, l );
}
}
#endif
}
-void n_coarse_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi,
+void n_coarse_block_PRECISION_boundary_op( vector_PRECISION *eta, vector_PRECISION *phi,
int k, schwarz_PRECISION_struct *s, level_struct *l ) { // k: number of current block
int *bbl = s->block_boundary_length, n = l->num_lattice_site_var;
@@ -1314,17 +1322,19 @@ void n_coarse_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISIO
for ( int i=bbl[2*mu]; i<bbl[2*mu+1]; i+=2 ) {
int index = s->block[k].bt[i];
int neighbor_index = s->block[k].bt[i+1];
- vector_PRECISION phi_pt = phi + n*neighbor_index;
- vector_PRECISION eta_pt = eta + n*index;
- coarse_n_hopp_PRECISION_vectorized( eta_pt, phi_pt, Dplus + 4*vectorized_link_offset*index, l );
+ vector_PRECISION phi_pt, eta_pt;
+ phi_pt.vector_buffer = phi->vector_buffer + n*neighbor_index;
+ eta_pt.vector_buffer = eta->vector_buffer + n*index;
+ coarse_n_hopp_PRECISION_vectorized( &eta_pt, &phi_pt, Dplus + 4*vectorized_link_offset*index, l );
}
// minus mu direction
for ( int i=bbl[2*mu+1]; i<bbl[2*mu+2]; i+=2 ) {
int index = s->block[k].bt[i];
int neighbor_index = s->block[k].bt[i+1];
- vector_PRECISION phi_pt = phi + n*neighbor_index;
- vector_PRECISION eta_pt = eta + n*index;
- coarse_n_hopp_PRECISION_vectorized( eta_pt, phi_pt, Dminus + 4*vectorized_link_offset*neighbor_index, l );
+ vector_PRECISION phi_pt, eta_pt;
+ phi_pt.vector_buffer = phi->vector_buffer + n*neighbor_index;
+ eta_pt.vector_buffer = eta->vector_buffer + n*index;
+ coarse_n_hopp_PRECISION_vectorized( &eta_pt, &phi_pt, Dminus + 4*vectorized_link_offset*neighbor_index, l );
}
}
#else
@@ -1336,19 +1346,21 @@ void n_coarse_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISIO
for ( int i=bbl[2*mu]; i<bbl[2*mu+1]; i+=2 ) {
int index = s->block[k].bt[i];
int neighbor_index = s->block[k].bt[i+1];
- vector_PRECISION phi_pt = phi + n*neighbor_index;
- vector_PRECISION eta_pt = eta + n*index;
+ vector_PRECISION phi_pt, eta_pt;
+ phi_pt.vector_buffer = phi->vector_buffer + n*neighbor_index;
+ eta_pt.vector_buffer = eta->vector_buffer + n*index;
config_PRECISION D_pt = D + site_size*index + link_size*mu;
- coarse_n_hopp_PRECISION( eta_pt, phi_pt, D_pt, l );
+ coarse_n_hopp_PRECISION( &eta_pt, &phi_pt, D_pt, l );
}
// minus mu direction
for ( int i=bbl[2*mu+1]; i<bbl[2*mu+2]; i+=2 ) {
int index = s->block[k].bt[i];
int neighbor_index = s->block[k].bt[i+1];
- vector_PRECISION phi_pt = phi + n*neighbor_index;
- vector_PRECISION eta_pt = eta + n*index;
+ vector_PRECISION phi_pt, eta_pt;
+ phi_pt.vector_buffer = phi->vector_buffer + n*neighbor_index;
+ eta_pt.vector_buffer = eta->vector_buffer + n*index;
config_PRECISION D_pt = D + site_size*neighbor_index + link_size*mu;
- coarse_n_daggered_hopp_PRECISION( eta_pt, phi_pt, D_pt, l );
+ coarse_n_daggered_hopp_PRECISION( &eta_pt, &phi_pt, D_pt, l );
}
}
#endif
}
@@ -1412,18 +1424,19 @@ void schwarz_PRECISION_setup( schwarz_PRECISION_struct *s, operator_double_struc
}
-void additive_schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_phi, vector_PRECISION eta, const int cycles, int res,
+void additive_schwarz_PRECISION( vector_PRECISION *phi, vector_PRECISION *D_phi, vector_PRECISION *eta, const int cycles, int res,
schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ) {
START_NO_HYPERTHREADS(threading)
int k, mu, i, nb = s->num_blocks;
- vector_PRECISION r = s->buf1, Dphi = s->buf4, latest_iter = s->buf2, x = s->buf3, latest_iter2 = s->buf5, swap = NULL;
+ vector_PRECISION *r = &(s->buf1), *Dphi = &(s->buf4), *latest_iter = &(s->buf2), *x = &(s->buf3), *latest_iter2 = &(s->buf5), *swap = NULL;
void (*block_op)() = (l->depth==0)?block_d_plus_clover_PRECISION:coarse_block_operator_PRECISION;
void (*boundary_op)() = (l->depth==0)?block_PRECISION_boundary_op:coarse_block_PRECISION_boundary_op,
(*n_boundary_op)() = (l->depth==0)?n_block_PRECISION_boundary_op:n_coarse_block_PRECISION_boundary_op,
- (* block_solve)() = (l->depth==0&&g.odd_even)?block_solve_oddeven_PRECISION:local_minres_PRECISION;
+ (*block_solve)() = (l->depth==0&&g.odd_even)?block_solve_oddeven_PRECISION:local_minres_PRECISION;
+ //vector_PRECISION_init(swap);
int nb_thread_start;
int nb_thread_end;
@@ -1594,16 +1607,13 @@ void additive_schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_phi, v
}
-void red_black_schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_phi, vector_PRECISION eta, const int cycles, int res,
+void red_black_schwarz_PRECISION( vector_PRECISION *phi, vector_PRECISION *D_phi, vector_PRECISION *eta, const int cycles, int res,
schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ) {
START_NO_HYPERTHREADS(threading)
int k=0, mu, i, init_res = res, res_comm = res, step;
- vector_PRECISION r = s->buf1;
- vector_PRECISION Dphi = s->buf4;
- vector_PRECISION latest_iter = s->buf2;
- vector_PRECISION x = s->buf3;
+ vector_PRECISION *r = &(s->buf1), *Dphi = &(s->buf4), *latest_iter = &(s->buf2), *x = &(s->buf3);
void (*block_op)() = (l->depth==0)?block_d_plus_clover_PRECISION:coarse_block_operator_PRECISION;
void (*boundary_op)() = (l->depth==0)?block_PRECISION_boundary_op:coarse_block_PRECISION_boundary_op,
(*n_boundary_op)() = (l->depth==0)?n_block_PRECISION_boundary_op:n_coarse_block_PRECISION_boundary_op,
@@ -1612,7 +1622,7 @@ void red_black_schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_phi,
int commdir[8] = {+1,-1,-1,+1,-1,+1,+1,-1};
SYNC_CORES(threading)
-
+
int block_thread_start[8], block_thread_end[8];
for ( i=0; i<8; i++ )
compute_core_start_end_custom(0,
s->block_list_length[i], block_thread_start+i, block_thread_end+i, l, threading, 1 );
@@ -1639,18 +1649,23 @@ void red_black_schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_phi,
// perform the Schwarz iteration, solve the block systems
for ( k=0; k<cycles; k++ ) {
for ( step=0; step<8; step++ ) {
for ( i=block_thread_start[step]; i<block_thread_end[step]; i++ ) {
int index = s->block_list[step][i];
+ printf0("index: %d\n", index);
START_MASTER(threading)
PROF_PRECISION_START( _SM3 );
END_MASTER(threading)
if ( res == _RES ) {
if ( k==0 && init_res == _RES ) {
+ printf0("calling block_op\n");
block_op( Dphi, x, s->block[index].start*l->num_lattice_site_var, s, l, no_threading );
boundary_op( Dphi, x, index, s, l, no_threading );
vector_PRECISION_minus( r, eta, Dphi, s->block[index].start*l->num_lattice_site_var, s->block[index].start*l->num_lattice_site_var+s->block_vector_size, l );
} else {
+ printf0("calling n_boundary\n");
n_boundary_op( r, latest_iter, index, s, l );
}
}
@@ -1658,14 +1673,16 @@
PROF_PRECISION_STOP( _SM3, 1 );
PROF_PRECISION_START( _SM4 );
END_MASTER(threading)
- // local minres updates x, r and latest iter
+ // local minres updates x, r and latest iter
block_solve( x, r, latest_iter, s->block[index].start*l->num_lattice_site_var, s, l, no_threading );
- START_MASTER(threading)
+ START_MASTER(threading)
PROF_PRECISION_STOP( _SM4, 1 );
END_MASTER(threading)
+ printf0(" fin index %d\n", i);
}
if ( res_comm == _RES && !(k==cycles-1 && (step==6||step==7) && D_phi==NULL) ) {
+ printf0("calling comms\n");
START_LOCKED_MASTER(threading)
for ( mu=0; mu<4; mu++ ) {
communicate[(step%4)/2]( (k==0 && step < 6 && init_res == _RES)?x:latest_iter, mu, commdir[step], &(s->op.c), l );
@@ -1764,16 +1781,13 @@
}
-void schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_phi, vector_PRECISION eta, const int cycles, int res,
+void schwarz_PRECISION( vector_PRECISION *phi, vector_PRECISION *D_phi, vector_PRECISION *eta, const int cycles, int res,
schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ) {
START_NO_HYPERTHREADS(threading)
int color, k, mu, i, nb = s->num_blocks, init_res = res;
- vector_PRECISION r = s->buf1;
- vector_PRECISION Dphi = s->buf4;
- vector_PRECISION latest_iter = s->buf2;
- vector_PRECISION x = s->buf3;
+ vector_PRECISION *r = &(s->buf1), *Dphi = &(s->buf4), *latest_iter = &(s->buf2), *x = &(s->buf3);
void (*block_op)() = (l->depth==0)?block_d_plus_clover_PRECISION:coarse_block_operator_PRECISION;
void (*boundary_op)() = (l->depth==0)?block_PRECISION_boundary_op:coarse_block_PRECISION_boundary_op,
(*n_boundary_op)() = (l->depth==0)?n_block_PRECISION_boundary_op:n_coarse_block_PRECISION_boundary_op,
@@ -1980,7 +1994,7 @@ void schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_phi, vector_PRE
}
-void sixteen_color_schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_phi, vector_PRECISION eta, const int cycles, int res,
+void sixteen_color_schwarz_PRECISION( vector_PRECISION *phi, vector_PRECISION *D_phi, vector_PRECISION *eta, const int cycles, int res,
schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ) {
START_NO_HYPERTHREADS(threading)
@@ -1989,7 +2003,7 @@
if ( s->num_colors == 2 ) schwarz_PRECISION( phi, D_phi, eta, cycles, res, s, l, no_threading );
else {
int color, k, mu, i, nb = s->num_blocks;
- vector_PRECISION r = s->buf1, Dphi = s->buf4, latest_iter = s->buf2, x = s->buf3;
+
vector_PRECISION *r = &(s->buf1), *Dphi = &(s->buf4), *latest_iter = &(s->buf2), *x = &(s->buf3);
void (*block_op)() = (l->depth==0)?block_d_plus_clover_PRECISION:coarse_block_operator_PRECISION;
void (*boundary_op)() = (l->depth==0)?block_PRECISION_boundary_op:coarse_block_PRECISION_boundary_op,
(*n_boundary_op)() = (l->depth==0)?n_block_PRECISION_boundary_op:n_coarse_block_PRECISION_boundary_op,
@@ -2101,10 +2115,11 @@ void sixteen_color_schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_p
#ifdef SCHWARZ_RES
START_LOCKED_MASTER(threading)
- vector_PRECISION true_r = NULL;
+ vector_PRECISION true_r;
+ vector_PRECISION_init(&true_r);
- PUBLIC_MALLOC( true_r, complex_PRECISION, l->vector_size );
- vector_PRECISION_define( true_r, 0, 0, l->inner_vector_size, l );
+ PUBLIC_MALLOC( true_r.vector_buffer, complex_PRECISION, l->vector_size );
+ vector_PRECISION_define( &true_r, 0, 0, l->inner_vector_size, l );
if ( D_phi == NULL ) {
@@ -2113,24 +2128,24 @@
ghost_update_PRECISION( x, mu, -1, &(s->op.c), l );
}
for ( i=0; i<nb; i++ ) {
- block_op( true_r, x, s->block[i].start*l->num_lattice_site_var, s, l, no_threading );
+ block_op( &true_r, x, s->block[i].start*l->num_lattice_site_var, s, l, no_threading );
}
for ( mu=0; mu<4; mu++ ) {
ghost_update_wait_PRECISION( x, mu, +1, &(s->op.c), l );
ghost_update_wait_PRECISION( x, mu, -1, &(s->op.c), l );
}
for ( i=0; i<nb; i++ ) {
- boundary_op( true_r, x, i, s, l, no_threading );
+ boundary_op( &true_r, x, i, s, l, no_threading );
}
- vector_PRECISION_saxpy( true_r, eta, true_r, -1, 0, l->inner_vector_size, l );
- PRECISION r_norm = global_norm_PRECISION( true_r, 0, l->inner_vector_size, l, no_threading ),
+ vector_PRECISION_saxpy( &true_r, eta, &true_r, -1, 0, l->inner_vector_size, l );
+ PRECISION r_norm = global_norm_PRECISION( &true_r, 0, l->inner_vector_size, l, no_threading ),
den = global_norm_PRECISION( eta, 0, l->inner_vector_size, l, no_threading );
r_norm/=den;
char number[3]; sprintf( number, "%2d", 31+l->depth ); printf0("\033[1;%2sm|", number );
printf0(" ---- depth: %d, c: %d, schwarz iter %2d, norm: %11.6le |", l->depth, s->num_colors, k, r_norm );
printf0("\033[0m\n"); fflush(0);
- PUBLIC_FREE( true_r, complex_PRECISION, l->vector_size );
+ PUBLIC_FREE( true_r.vector_buffer, complex_PRECISION, l->vector_size );
END_LOCKED_MASTER(threading)
#endif
}
@@ -2139,10 +2154,10 @@
}
-void trans_PRECISION( vector_PRECISION out, vector_double in, int *tt, level_struct *l, struct Thread *threading ) {
+void trans_PRECISION( vector_PRECISION *out, vector_double *in, int *tt, level_struct *l, struct Thread *threading ) {
int i, index;
- vector_PRECISION out_pt = out; vector_double in_pt = in;
+ buffer_PRECISION out_pt = out->vector_buffer; buffer_double in_pt = in->vector_buffer;
int start = threading->start_site[l->depth];
int end = threading->end_site[l->depth];
@@ -2153,16 +2168,16 @@ void trans_PRECISION( vector_PRECISION out, vector_double in, int *tt, level_str
if( g.n_flavours == 2 )
for ( i=start; i<end; i++ ) {
index = tt[i];
- out_pt = out + 24*index;
- in_pt = in + 24*i;
+ out_pt = out->vector_buffer + 24*index;
+ in_pt = in->vector_buffer + 24*i;
FOR24( *out_pt = (complex_PRECISION) *in_pt; out_pt++; in_pt++; )
}
else
#endif
for ( i=start; i<end; i++ ) {
index = tt[i];
- out_pt = out + 12*index;
- in_pt = in + 12*i;
+ out_pt = out->vector_buffer + 12*index;
+ in_pt = in->vector_buffer + 12*i;
FOR12( *out_pt = (complex_PRECISION) *in_pt; out_pt++; in_pt++; )
}
END_NO_HYPERTHREADS(threading)
@@ -2170,10 +2185,10 @@
}
-void trans_back_PRECISION( vector_double out, vector_PRECISION in, int *tt, level_struct *l, struct Thread *threading ) {
+void trans_back_PRECISION( vector_double *out, vector_PRECISION *in, int *tt, level_struct *l, struct Thread *threading ) {
int i, index;
- vector_double out_pt = out; vector_PRECISION in_pt = in;
+ buffer_double out_pt = out->vector_buffer; buffer_PRECISION in_pt = in->vector_buffer;
int start = threading->start_site[l->depth];
int end = threading->end_site[l->depth];
@@ -2184,16 +2199,16 @@ void trans_back_PRECISION( vector_double out, vector_PRECISION in, int *tt, leve
if( g.n_flavours == 2 )
for ( i=start; i<end; i++ ) {
index = tt[i];
- in_pt = in + 24*index;
- out_pt = out + 24*i;
+ in_pt = in->vector_buffer + 24*index;
+ out_pt = out->vector_buffer + 24*i;
FOR24( *out_pt = (complex_double) *in_pt; out_pt++; in_pt++; )
}
else
#endif
for ( i=start; i<end; i++ ) {
index = tt[i];
- in_pt = in + 12*index;
- out_pt = out + 12*i;
+ in_pt = in->vector_buffer + 12*index;
+ out_pt = out->vector_buffer + 12*i;
FOR12( *out_pt = (complex_double) *in_pt; out_pt++; in_pt++; )
}
END_NO_HYPERTHREADS(threading)
@@ -2222,42 +2237,45 @@ void schwarz_PRECISION_mvm_testfun( schwarz_PRECISION_struct *s, level_struct *l
void (*op)() = (l->depth==0)?d_plus_clover_PRECISION:apply_coarse_operator_PRECISION;
void (*boundary_op)() = (l->depth==0)?block_PRECISION_boundary_op:coarse_block_PRECISION_boundary_op;
- vector_PRECISION v1 = NULL, v2 = NULL, v3 = NULL;
+ vector_PRECISION v1, v2, v3;
PRECISION diff;
- MALLOC( v1, complex_PRECISION, svs );
- MALLOC( v2, complex_PRECISION, vs );
- MALLOC( v3, complex_PRECISION, vs );
+ vector_PRECISION_init(&v1);
+ vector_PRECISION_init(&v2);
+ vector_PRECISION_init(&v3);
- vector_PRECISION_define_random( v1, 0, ivs, l );
+ MALLOC( v1.vector_buffer, complex_PRECISION, svs );
+ MALLOC( v2.vector_buffer, complex_PRECISION, vs );
+ MALLOC( v3.vector_buffer, complex_PRECISION, vs );
- op( v3, v1, &(s->op), l, no_threading );
+ vector_PRECISION_define_random( &v1, 0, ivs, l );
+
+ op( &v3, &v1, &(s->op), l, no_threading );
for ( mu=0; mu<4; mu++ ) {
- ghost_update_PRECISION( v1, mu, +1, &(s->op.c), l );
- ghost_update_PRECISION( v1, mu, -1, &(s->op.c), l );
+ ghost_update_PRECISION( &v1, mu, +1, &(s->op.c), l );
+ ghost_update_PRECISION( &v1, mu, -1, &(s->op.c), l );
}
for ( mu=0; mu<4; mu++ ) {
- ghost_update_wait_PRECISION( v1, mu, +1, &(s->op.c), l );
- ghost_update_wait_PRECISION( v1, mu, -1, &(s->op.c), l );
+ ghost_update_wait_PRECISION( &v1, mu, +1, &(s->op.c), l );
+ ghost_update_wait_PRECISION( &v1, mu, -1, &(s->op.c), l );
}
for ( i=0; i<nb; i++ ) {
- block_op( v2, v1, s->block[i].start*l->num_lattice_site_var, s, l, no_threading );
- boundary_op( v2, v1, i, s, l, no_threading );
+ block_op( &v2, &v1, s->block[i].start*l->num_lattice_site_var, s, l, no_threading );
+ boundary_op( &v2, &v1, i, s, l, no_threading );
}
- vector_PRECISION_minus( v3, v3, v2, 0, l->inner_vector_size, l );
- diff = global_norm_PRECISION( v3, 0, l->inner_vector_size, l, no_threading ) /
- global_norm_PRECISION( v2, 0, l->inner_vector_size, l, no_threading );
+ vector_PRECISION_minus( &v3, &v3, &v2, 0, l->inner_vector_size, l );
+ diff = global_norm_PRECISION( &v3, 0, l->inner_vector_size, l, no_threading ) /
+ global_norm_PRECISION( &v2, 0, l->inner_vector_size, l, no_threading );
test0_PRECISION("depth: %d, correctness of local residual vector: %le\n", l->depth, diff );
- FREE( v1, complex_PRECISION, l->schwarz_vector_size );
- FREE( v2, complex_PRECISION, l->vector_size );
- FREE( v3, complex_PRECISION, l->vector_size );
+ FREE( v1.vector_buffer, complex_PRECISION, l->schwarz_vector_size );
+ FREE( v2.vector_buffer, complex_PRECISION, l->vector_size );
+ FREE( v3.vector_buffer, complex_PRECISION, l->vector_size );
END_UNTHREADED_FUNCTION(threading)
}
-
a/src/schwarz_generic.h +++ b/src/schwarz_generic.h @@ -24,13 +24,13 @@ struct Thread; - void block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, int k, + void block_PRECISION_boundary_op( vector_PRECISION *eta, vector_PRECISION *phi, int k, schwarz_PRECISION_struct *s, level_struct *l ); - void n_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, int k, + void n_block_PRECISION_boundary_op( vector_PRECISION *eta, vector_PRECISION *phi, int k, schwarz_PRECISION_struct *s, level_struct *l ); - void coarse_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, + void coarse_block_PRECISION_boundary_op( vector_PRECISION *eta, vector_PRECISION *phi, int k, schwarz_PRECISION_struct *s, level_struct *l ); - void n_coarse_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, + void n_coarse_block_PRECISION_boundary_op( vector_PRECISION *eta, vector_PRECISION *phi, int k, schwarz_PRECISION_struct *s, level_struct *l ); void smoother_PRECISION_def( level_struct *l ); @@ -46,17 +46,17 @@ struct Thread; void schwarz_PRECISION_def( schwarz_PRECISION_struct *s, operator_double_struct *op, level_struct *l ); void schwarz_PRECISION_free( schwarz_PRECISION_struct *s, level_struct *l ); - void additive_schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_phi, vector_PRECISION eta, const int cycles, int res, + void additive_schwarz_PRECISION( vector_PRECISION *phi, vector_PRECISION *D_phi, vector_PRECISION *eta, const int cycles, int res, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ); - void red_black_schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_phi, vector_PRECISION eta, const int cycles, int res, + void red_black_schwarz_PRECISION( vector_PRECISION *phi, vector_PRECISION *D_phi, vector_PRECISION *eta, const int cycles, int res, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ); - void schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_phi, vector_PRECISION eta, const int cycles, int res, + void schwarz_PRECISION( vector_PRECISION *phi, vector_PRECISION *D_phi, vector_PRECISION *eta, const int cycles, int res, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ); - void sixteen_color_schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_phi, vector_PRECISION eta, const int cycles, int res, + void sixteen_color_schwarz_PRECISION( vector_PRECISION *phi, vector_PRECISION *D_phi, vector_PRECISION *eta, const int cycles, int res, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ); - void trans_PRECISION( vector_PRECISION out, vector_double in, int *tt, level_struct *l, struct Thread *threading ); - void trans_back_PRECISION( vector_double out, vector_PRECISION in, int *tt, level_struct *l, struct Thread *threading ); + void trans_PRECISION( vector_PRECISION *out, vector_double *in, int *tt, level_struct *l, struct Thread *threading ); + void trans_back_PRECISION( vector_double *out, vector_PRECISION *in, int *tt, level_struct *l, struct Thread *threading ); void schwarz_PRECISION_mvm_testfun( schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ); diff --git a/src/setup_generic.c b/src/setup_generic.c index 4493bae..07a53d7 100644 --- a/src/setup_generic.c +++ b/src/setup_generic.c @@ -23,7 +23,7 @@ void inv_iter_2lvl_extension_setup_PRECISION( int setup_iter, level_struct *l, struct Thread *threading ); void inv_iter_inv_fcycle_PRECISION( int setup_iter, level_struct *l, struct Thread 
*threading ); -void testvector_analysis_PRECISION( vector_PRECISION *test_vectors, level_struct *l, struct Thread *threading ); +void testvector_analysis_PRECISION( vector_PRECISION **test_vectors, level_struct *l, struct Thread *threading ); void read_tv_from_file_PRECISION( level_struct *l, struct Thread *threading ); void coarse_grid_correction_PRECISION_setup( level_struct *l, struct Thread *threading ) { @@ -91,12 +91,12 @@ void coarse_grid_correction_PRECISION_setup( level_struct *l, struct Thread *thr SYNC_HYPERTHREADS(threading) if ( !l->idle ) { for ( int i=0; inext_level->num_eig_vect,l->num_eig_vect); i++ ) { - restrict_PRECISION( l->next_level->is_PRECISION.test_vector[i], l->is_PRECISION.test_vector[i], l, threading ); + restrict_PRECISION( &(l->next_level->is_PRECISION.test_vector[i]), &(l->is_PRECISION.test_vector[i]), l, threading ); } START_LOCKED_MASTER(threading) for ( int i=MIN(l->next_level->num_eig_vect,l->num_eig_vect); inext_level->num_eig_vect; i++ ) { if ( !l->next_level->idle ) - vector_PRECISION_define_random( l->next_level->is_PRECISION.test_vector[i], 0, + vector_PRECISION_define_random( &(l->next_level->is_PRECISION.test_vector[i]), 0, l->next_level->inner_vector_size, l->next_level ); } END_LOCKED_MASTER(threading) @@ -121,7 +121,7 @@ void iterative_PRECISION_setup( int setup_iter, level_struct *l, struct Thread * level_struct *lp = l; while( lp->level > 0 ) { - testvector_analysis_PRECISION( lp->is_PRECISION.test_vector, lp, threading ); + testvector_analysis_PRECISION( &(lp->is_PRECISION.test_vector), lp, threading ); lp = lp->next_level; if ( lp == NULL ) break; @@ -142,18 +142,19 @@ void read_tv_from_file_PRECISION( level_struct *l, struct Thread *threading ) { int n = l->num_eig_vect, i; char filename[STRINGLENGTH+1]; - vector_double tmp = NULL; + vector_double tmp; + vector_double_init(&tmp); - MALLOC( tmp, complex_double, l->inner_vector_size ); + MALLOC( tmp.vector_buffer, complex_double, l->inner_vector_size ); for ( i=0; iis_PRECISION.test_vector[i], tmp, l->s_PRECISION.op.translation_table, l, no_threading ); + vector_io( (double*)tmp.vector_buffer, filename, _READ, l ); + trans_PRECISION( &(l->is_PRECISION.test_vector[i]), &tmp, l->s_PRECISION.op.translation_table, l, no_threading ); } - FREE( tmp, complex_double, l->inner_vector_size ); + FREE( tmp.vector_buffer, complex_double, l->inner_vector_size ); END_LOCKED_MASTER(threading) @@ -197,20 +198,21 @@ void interpolation_PRECISION_define( vector_double *V, level_struct *l, struct T int pi = 1, pn = n*6; #endif vector_PRECISION *buffer = NULL; + int start = threading->start_index[l->depth]; int end = threading->end_index[l->depth]; if ( V == NULL ) { - PUBLIC_MALLOC( buffer, complex_PRECISION*, 3 ); + PUBLIC_MALLOC( buffer, vector_PRECISION, 3 ); START_MASTER(threading) - buffer[0] = NULL; + vector_PRECISION_init(&buffer[0]); END_MASTER(threading) - PUBLIC_MALLOC( buffer[0], complex_PRECISION, l->vector_size*3 ); + PUBLIC_MALLOC( buffer[0].vector_buffer, complex_PRECISION, l->vector_size*3 ); START_MASTER(threading) for( i=1; i<3; i++) - buffer[i] = buffer[0] + l->vector_size*i; + buffer[i].vector_buffer = buffer[0].vector_buffer + l->vector_size*i; if ( g.print > 0 ) printf0("initial definition --- depth: %d\n", l->depth ); #ifdef DEBUG if ( g.print > 0 ) { printf0("\033[0;42m\033[1;37m|"); fflush(0); } @@ -221,16 +223,16 @@ void interpolation_PRECISION_define( vector_double *V, level_struct *l, struct T for ( k=0; kdepth == 0 ) { START_LOCKED_MASTER(threading) - vector_PRECISION_define_random( 
l->is_PRECISION.test_vector[k], 0, l->inner_vector_size, l ); + vector_PRECISION_define_random( &(l->is_PRECISION.test_vector[k]), 0, l->inner_vector_size, l ); END_LOCKED_MASTER(threading) // } - smoother_PRECISION( buffer[0], NULL, l->is_PRECISION.test_vector[k], 1, _NO_RES, l, threading ); - vector_PRECISION_copy( l->is_PRECISION.test_vector[k], buffer[0], start, end, l ); - smoother_PRECISION( buffer[0], NULL, l->is_PRECISION.test_vector[k], g.method>=4?1:2, _NO_RES, l, threading ); - vector_PRECISION_copy( l->is_PRECISION.test_vector[k], buffer[0], start, end, l ); - smoother_PRECISION( buffer[0], NULL, l->is_PRECISION.test_vector[k], g.method>=4?1:3, _NO_RES, l, threading ); - vector_PRECISION_copy( l->is_PRECISION.test_vector[k], buffer[0], start, end, l ); + smoother_PRECISION( &buffer[0], NULL, &(l->is_PRECISION.test_vector[k]), 1, _NO_RES, l, threading ); + vector_PRECISION_copy( &(l->is_PRECISION.test_vector[k]), &buffer[0], start, end, l ); + smoother_PRECISION( &buffer[0], NULL, &(l->is_PRECISION.test_vector[k]), g.method>=4?1:2, _NO_RES, l, threading ); + vector_PRECISION_copy( &(l->is_PRECISION.test_vector[k]), &buffer[0], start, end, l ); + smoother_PRECISION( &buffer[0], NULL, &(l->is_PRECISION.test_vector[k]), g.method>=4?1:3, _NO_RES, l, threading ); + vector_PRECISION_copy( &(l->is_PRECISION.test_vector[k]), &buffer[0], start, end, l ); pc += 6; #ifdef DEBUG @@ -240,12 +242,12 @@ void interpolation_PRECISION_define( vector_double *V, level_struct *l, struct T #endif } - PUBLIC_FREE( buffer[0], complex_PRECISION, l->vector_size*3 ); - PUBLIC_FREE( buffer, complex_PRECISION*, 3 ); + PUBLIC_FREE( buffer[0].vector_buffer, complex_PRECISION, l->vector_size*3 ); + PUBLIC_FREE( buffer, vector_PRECISION, 3 ); for ( k=0; kis_PRECISION.test_vector[k], l->is_PRECISION.test_vector[k], - 1.0/global_norm_PRECISION( l->is_PRECISION.test_vector[k], 0, l->inner_vector_size, l, threading ), + vector_PRECISION_real_scale( &(l->is_PRECISION.test_vector[k]), &(l->is_PRECISION.test_vector[k]), + 1.0/global_norm_PRECISION( &(l->is_PRECISION.test_vector[k]), 0, l->inner_vector_size, l, threading ), start, end, l ); } @@ -257,26 +259,26 @@ void interpolation_PRECISION_define( vector_double *V, level_struct *l, struct T } else { for ( i=0; iis_PRECISION.test_vector[i], V[i], l->s_PRECISION.op.translation_table, l, threading ); + trans_PRECISION( &(l->is_PRECISION.test_vector[i]), &V[i], l->s_PRECISION.op.translation_table, l, threading ); } } #ifndef INTERPOLATION_SETUP_LAYOUT_OPTIMIZED_PRECISION for ( k=0; kis_PRECISION.interpolation[k], l->is_PRECISION.test_vector[k], start, end, l ); + vector_PRECISION_copy( &(l->is_PRECISION.interpolation[k]), &(l->is_PRECISION.test_vector[k]), start, end, l ); } #endif - testvector_analysis_PRECISION( l->is_PRECISION.test_vector, l, threading ); + testvector_analysis_PRECISION( &(l->is_PRECISION.test_vector), l, threading ); #ifdef INTERPOLATION_SETUP_LAYOUT_OPTIMIZED_PRECISION - define_interpolation_PRECISION_operator( l->is_PRECISION.test_vector, l, threading ); + define_interpolation_PRECISION_operator( &(l->is_PRECISION.test_vector->vector_buffer), l, threading ); gram_schmidt_on_aggregates_PRECISION_vectorized( l->is_PRECISION.operator, n, l, threading ); #else gram_schmidt_on_aggregates_PRECISION( l->is_PRECISION.interpolation, n, l, threading ); - define_interpolation_PRECISION_operator( l->is_PRECISION.interpolation, l, threading ); + define_interpolation_PRECISION_operator( &(l->is_PRECISION.interpolation->vector_buffer), l, threading ); #endif } 
@@ -287,7 +289,7 @@ void re_setup_PRECISION( level_struct *l, struct Thread *threading ) { if ( l->level > 0 ) { if ( !l->idle ) { #ifdef INTERPOLATION_SETUP_LAYOUT_OPTIMIZED_PRECISION - define_interpolation_PRECISION_operator( l->is_PRECISION.test_vector, l, threading ); + define_interpolation_PRECISION_operator( &(l->is_PRECISION.test_vector->vector_buffer), l, threading ); gram_schmidt_on_aggregates_PRECISION_vectorized( l->is_PRECISION.operator, l->num_eig_vect, l, threading ); if ( l->depth > 0 ) gram_schmidt_on_aggregates_PRECISION_vectorized( l->is_PRECISION.operator, l->num_eig_vect, l, threading ); @@ -295,13 +297,13 @@ void re_setup_PRECISION( level_struct *l, struct Thread *threading ) { START_LOCKED_MASTER(threading) #else for ( int i=0; inum_eig_vect; i++ ) { - vector_PRECISION_copy( l->is_PRECISION.interpolation[i], l->is_PRECISION.test_vector[i], + vector_PRECISION_copy( &(l->is_PRECISION.interpolation[i]), &(l->is_PRECISION.test_vector[i]), threading->start_index[l->depth], threading->end_index[l->depth], l ); } gram_schmidt_on_aggregates_PRECISION( l->is_PRECISION.interpolation, l->num_eig_vect, l, threading ); if ( l->depth > 0 ) gram_schmidt_on_aggregates_PRECISION( l->is_PRECISION.interpolation, l->num_eig_vect, l, threading ); - define_interpolation_PRECISION_operator( l->is_PRECISION.interpolation, l, threading ); + define_interpolation_PRECISION_operator( &(l->is_PRECISION.interpolation->vector_buffer), l, threading ); START_LOCKED_MASTER(threading) coarse_operator_PRECISION_setup( l->is_PRECISION.interpolation, l ); #endif @@ -331,13 +333,14 @@ void re_setup_PRECISION( level_struct *l, struct Thread *threading ) { void inv_iter_2lvl_extension_setup_PRECISION( int setup_iter, level_struct *l, struct Thread *threading ) { if ( !l->idle ) { - vector_PRECISION buf1 = NULL; + vector_PRECISION buf1; gmres_PRECISION_struct gmres; // TODO: bugfix - threading, etc START_LOCKED_MASTER(threading) - MALLOC( buf1, complex_PRECISION, l->vector_size ); + vector_PRECISION_init(&buf1); + MALLOC( buf1.vector_buffer, complex_PRECISION, l->vector_size ); fgmres_PRECISION_struct_init( &gmres ); fgmres_PRECISION_struct_alloc( g.coarse_iter, g.coarse_restart, l->next_level->vector_size, g.coarse_tol, _COARSE_GMRES, _NOTHING, NULL, apply_coarse_operator_PRECISION, &gmres, l->next_level ); @@ -358,7 +361,7 @@ void inv_iter_2lvl_extension_setup_PRECISION( int setup_iter, level_struct *l, s #endif END_MASTER(threading) for ( int i=0; inum_eig_vect; i++ ) { - restrict_PRECISION( gmres.b, l->is_PRECISION.test_vector[i], l, threading ); + restrict_PRECISION( &(gmres.b), &(l->is_PRECISION.test_vector[i]), l, threading ); if ( !l->next_level->idle ) { if ( g.odd_even && l->next_level->level == 0 ) { coarse_solve_odd_even_PRECISION( &gmres, &(l->next_level->oe_op_PRECISION), l->next_level, threading ); @@ -366,10 +369,10 @@ void inv_iter_2lvl_extension_setup_PRECISION( int setup_iter, level_struct *l, s fgmres_PRECISION( &gmres, l->next_level, threading ); } } - interpolate3_PRECISION( buf1, gmres.x, l, threading ); - smoother_PRECISION( buf1, NULL, l->is_PRECISION.test_vector[i], l->post_smooth_iter, _RES, l, threading ); - vector_PRECISION_real_scale( l->is_PRECISION.test_vector[i], buf1, - 1.0/global_norm_PRECISION( buf1, 0, l->inner_vector_size, l, threading ), + interpolate3_PRECISION( &buf1, &(gmres.x), l, threading ); + smoother_PRECISION( &buf1, NULL, &(l->is_PRECISION.test_vector[i]), l->post_smooth_iter, _RES, l, threading ); + vector_PRECISION_real_scale( 
&(l->is_PRECISION.test_vector[i]), &buf1, + 1.0/global_norm_PRECISION( &buf1, 0, l->inner_vector_size, l, threading ), threading->start_index[l->depth], threading->end_index[l->depth], l ); pc += l->post_smooth_iter; #ifdef DEBUG @@ -385,7 +388,7 @@ void inv_iter_2lvl_extension_setup_PRECISION( int setup_iter, level_struct *l, s #endif #ifdef INTERPOLATION_SETUP_LAYOUT_OPTIMIZED_PRECISION - define_interpolation_PRECISION_operator( l->is_PRECISION.test_vector, l, threading ); + define_interpolation_PRECISION_operator( &(l->is_PRECISION.test_vector->vector_buffer), l, threading ); gram_schmidt_on_aggregates_PRECISION_vectorized( l->is_PRECISION.operator, l->num_eig_vect, l, threading ); if ( l->depth > 0 ) gram_schmidt_on_aggregates_PRECISION_vectorized( l->is_PRECISION.operator, l->num_eig_vect, l, threading ); @@ -393,12 +396,12 @@ void inv_iter_2lvl_extension_setup_PRECISION( int setup_iter, level_struct *l, s START_LOCKED_MASTER(threading) #else for ( int i=0; inum_eig_vect; i++ ) - vector_PRECISION_copy( l->is_PRECISION.interpolation[i], l->is_PRECISION.test_vector[i], + vector_PRECISION_copy( &(l->is_PRECISION.interpolation[i]), &(l->is_PRECISION.test_vector[i]), threading->start_index[l->depth], threading->end_index[l->depth], l ); gram_schmidt_on_aggregates_PRECISION( l->is_PRECISION.interpolation, l->num_eig_vect, l, threading ); if ( l->depth > 0 ) gram_schmidt_on_aggregates_PRECISION( l->is_PRECISION.interpolation, l->num_eig_vect, l, threading ); - define_interpolation_PRECISION_operator( l->is_PRECISION.interpolation, l, threading ); + define_interpolation_PRECISION_operator( &(l->is_PRECISION.interpolation->vector_buffer), l, threading ); START_LOCKED_MASTER(threading) coarse_operator_PRECISION_setup( l->is_PRECISION.interpolation, l ); #endif @@ -425,7 +428,7 @@ void inv_iter_2lvl_extension_setup_PRECISION( int setup_iter, level_struct *l, s inv_iter_2lvl_extension_setup_PRECISION( setup_iter, l->next_level, threading ); START_LOCKED_MASTER(threading) - FREE( buf1, complex_PRECISION, l->vector_size ); + FREE( buf1.vector_buffer, complex_PRECISION, l->vector_size ); fgmres_PRECISION_struct_free( &gmres, l ); END_LOCKED_MASTER(threading) } @@ -448,17 +451,19 @@ void test_vector_PRECISION_update( int i, level_struct *l, struct Thread *thread test_vector_PRECISION_update( i, l->next_level, threading ); if ( !l->idle ) - vector_PRECISION_real_scale( l->is_PRECISION.test_vector[i], l->p_PRECISION.x, - 1.0/global_norm_PRECISION( l->p_PRECISION.x, 0, l->inner_vector_size, l, threading ), + vector_PRECISION_real_scale( &(l->is_PRECISION.test_vector[i]), &(l->p_PRECISION.x), + 1.0/global_norm_PRECISION( &(l->p_PRECISION.x), 0, l->inner_vector_size, l, threading ), threading->start_index[l->depth], threading->end_index[l->depth], l ); } void inv_iter_inv_fcycle_PRECISION( int setup_iter, level_struct *l, struct Thread *threading ) { - vector_PRECISION v_buf = NULL; + vector_PRECISION v_buf; complex_PRECISION *buffer = NULL; + vector_PRECISION_init(&v_buf); + PUBLIC_MALLOC( buffer, complex_PRECISION, 2*l->num_eig_vect ); START_LOCKED_MASTER(threading) @@ -467,7 +472,7 @@ void inv_iter_inv_fcycle_PRECISION( int setup_iter, level_struct *l, struct Thre END_LOCKED_MASTER(threading) SYNC_MASTER_TO_ALL(threading) - PUBLIC_MALLOC( v_buf, complex_PRECISION, l->vector_size ); + PUBLIC_MALLOC( v_buf.vector_buffer, complex_PRECISION, l->vector_size ); if ( !l->idle ) { for ( int j=0; jis_PRECISION.test_vector, buffer, 0, l->num_eig_vect, l, threading ); for ( int i=0; inum_eig_vect; i++ ) { - 
vcycle_PRECISION( l->p_PRECISION.x, NULL, l->is_PRECISION.test_vector[i], _NO_RES, l, threading ); + vcycle_PRECISION( &(l->p_PRECISION.x), NULL, &(l->is_PRECISION.test_vector[i]), _NO_RES, l, threading ); test_vector_PRECISION_update( i, l, threading ); @@ -516,7 +521,7 @@ void inv_iter_inv_fcycle_PRECISION( int setup_iter, level_struct *l, struct Thre } } - PUBLIC_FREE( v_buf, complex_PRECISION, l->vector_size ); + PUBLIC_FREE( v_buf.vector_buffer, complex_PRECISION, l->vector_size ); PUBLIC_FREE( buffer, complex_PRECISION, 2*l->num_eig_vect ); if ( l->depth == 0 ) { @@ -527,7 +532,7 @@ void inv_iter_inv_fcycle_PRECISION( int setup_iter, level_struct *l, struct Thre } -void testvector_analysis_PRECISION( vector_PRECISION *test_vectors, level_struct *l, struct Thread *threading ) { +void testvector_analysis_PRECISION( vector_PRECISION **test_vectors, level_struct *l, struct Thread *threading ) { #ifdef TESTVECTOR_ANALYSIS START_UNTHREADED_FUNCTION(threading) if ( l->depth == 0 ) { @@ -537,12 +542,12 @@ void testvector_analysis_PRECISION( vector_PRECISION *test_vectors, level_struct printf0("--------------------------------------- depth: %d ----------------------------------------\n", l->depth ); for ( int i=0; inum_eig_vect; i++ ) { printf0("vector #%02d: ", i+1 ); - apply_operator_PRECISION( l->vbuf_PRECISION[3], test_vectors[i], &(l->p_PRECISION), l, no_threading ); - coarse_gamma5_PRECISION( l->vbuf_PRECISION[0], l->vbuf_PRECISION[3], 0, l->inner_vector_size, l ); - lambda = global_inner_product_PRECISION( test_vectors[i], l->vbuf_PRECISION[0], 0, l->inner_vector_size, l, no_threading ); + apply_operator_PRECISION( &(l->vbuf_PRECISION[3]), test_vectors[i], &(l->p_PRECISION), l, no_threading ); + coarse_gamma5_PRECISION( &(l->vbuf_PRECISION[0]), &(l->vbuf_PRECISION[3]), 0, l->inner_vector_size, l ); + lambda = global_inner_product_PRECISION( test_vectors[i], &(l->vbuf_PRECISION[0]), 0, l->inner_vector_size, l, no_threading ); lambda /= global_inner_product_PRECISION( test_vectors[i], test_vectors[i], 0, l->inner_vector_size, l, no_threading ); - vector_PRECISION_saxpy( l->vbuf_PRECISION[1], l->vbuf_PRECISION[0], test_vectors[i], -lambda, 0, l->inner_vector_size, l ); - mu = global_norm_PRECISION( l->vbuf_PRECISION[1], 0, l->inner_vector_size, l, no_threading )/global_norm_PRECISION( test_vectors[i], 0, l->inner_vector_size, l, no_threading ); + vector_PRECISION_saxpy( &(l->vbuf_PRECISION[1]), &(l->vbuf_PRECISION[0]), test_vectors[i], -lambda, 0, l->inner_vector_size, l ); + mu = global_norm_PRECISION( &(l->vbuf_PRECISION[1]), 0, l->inner_vector_size, l, no_threading )/global_norm_PRECISION( test_vectors[i], 0, l->inner_vector_size, l, no_threading ); printf0("singular value: %+lf%+lfi, singular vector precision: %le\n", (double)creal(lambda), (double)cimag(lambda), (double)mu ); } printf0("--------------------------------------- depth: %d ----------------------------------------\n", l->depth ); diff --git a/src/setup_generic.h b/src/setup_generic.h index 6d0ae49..c2926a2 100644 --- a/src/setup_generic.h +++ b/src/setup_generic.h @@ -26,7 +26,7 @@ void coarse_grid_correction_PRECISION_setup( level_struct *l, struct Thread *threading ); void coarse_grid_correction_PRECISION_free( level_struct *l ); - void interpolation_PRECISION_define( vector_double* V, level_struct *l, struct Thread *threading ); + void interpolation_PRECISION_define( vector_double *V, level_struct *l, struct Thread *threading ); void iterative_PRECISION_setup( int setup_iter, level_struct *l, struct Thread *threading 
); void re_setup_PRECISION( level_struct *l, struct Thread *threading ); void inv_iter_inv_fcycle_PRECISION( int setup_iter, level_struct *l, struct Thread *threading ); diff --git a/src/sse_coarse_operator_generic.c b/src/sse_coarse_operator_generic.c index cf3b73d..cde7a51 100644 --- a/src/sse_coarse_operator_generic.c +++ b/src/sse_coarse_operator_generic.c @@ -941,8 +941,8 @@ void coarse_aggregate_neighbor_couplings_PRECISION_vectorized( complex_PRECISION int D_site_offset = 4*n*n; int D_link_offset = n*n; - vector_PRECISION_define( eta1, 0, 0, n*offset, l ); - vector_PRECISION_define( eta2, 0, 0, n*offset, l ); + buffer_PRECISION_define( eta1, 0, 0, n*offset, l ); + buffer_PRECISION_define( eta2, 0, 0, n*offset, l ); // requires the positive boundaries of phi to be communicated before index_fw = neighbor[5*site+1 + mu]; diff --git a/src/sse_coarse_operator_generic.h b/src/sse_coarse_operator_generic.h index fb7391a..e51f44a 100644 --- a/src/sse_coarse_operator_generic.h +++ b/src/sse_coarse_operator_generic.h @@ -73,7 +73,7 @@ int site); - static inline void coarse_hopp_PRECISION_vectorized( vector_PRECISION eta, vector_PRECISION phi, + static inline void coarse_hopp_PRECISION_vectorized( vector_PRECISION *eta, vector_PRECISION *phi, OPERATOR_TYPE_PRECISION *D, level_struct *l ) { #ifdef OPTIMIZED_COARSE_NEIGHBOR_COUPLING_PRECISION int nv = l->num_parent_eig_vect; @@ -81,7 +81,7 @@ cgenmv_padded( 2*nv, D, lda, nv, (float *)phi, (float *)eta); #endif } - static inline void coarse_n_hopp_PRECISION_vectorized( vector_PRECISION eta, vector_PRECISION phi, + static inline void coarse_n_hopp_PRECISION_vectorized( vector_PRECISION *eta, vector_PRECISION *phi, OPERATOR_TYPE_PRECISION *D, level_struct *l ) { #ifdef OPTIMIZED_COARSE_NEIGHBOR_COUPLING_PRECISION int nv = l->num_parent_eig_vect; @@ -90,7 +90,7 @@ #endif } - static inline void coarse_self_couplings_PRECISION_vectorized( vector_PRECISION eta, vector_PRECISION phi, + static inline void coarse_self_couplings_PRECISION_vectorized( vector_PRECISION *eta, vector_PRECISION *phi, operator_PRECISION_struct *op, int start, int end, level_struct *l ) { #ifdef OPTIMIZED_COARSE_SELF_COUPLING_PRECISION int site_size = l->num_lattice_site_var; @@ -102,7 +102,7 @@ #endif for(int i=start; ivector_buffer[i*site_size+j] = 0.0; cgemv(site_size, clover+i*2*site_size*lda, lda, (float *)(phi+i*site_size), (float *)(eta+i*site_size)); } #endif diff --git a/src/sse_interpolation_generic.c b/src/sse_interpolation_generic.c index bd5f56a..d51309c 100644 --- a/src/sse_interpolation_generic.c +++ b/src/sse_interpolation_generic.c @@ -28,38 +28,38 @@ void interpolation_PRECISION_alloc( level_struct *l ) { int k, n = l->num_eig_vect; MALLOC( l->is_PRECISION.eigenvalues, complex_PRECISION, n ); - MALLOC( l->is_PRECISION.test_vector, complex_PRECISION*, n ); + MALLOC( l->is_PRECISION.test_vector, vector_PRECISION, n ); #ifndef INTERPOLATION_SETUP_LAYOUT_OPTIMIZED_PRECISION MALLOC( l->is_PRECISION.interpolation, complex_PRECISION*, n ); l->is_PRECISION.interpolation[0] = NULL; - MALLOC_HUGEPAGES( l->is_PRECISION.interpolation[0], complex_PRECISION, n*l->vector_size, 128 ); + MALLOC_HUGEPAGES( l->is_PRECISION.interpolation[0]->vector_buffer, complex_PRECISION, n*l->vector_size, 128 ); for ( k=1; kis_PRECISION.interpolation[k] = l->is_PRECISION.interpolation[0] + k*l->vector_size; + l->is_PRECISION.interpolation[k]->vector_buffer = l->is_PRECISION.interpolation[0]->vector_buffer + k*l->vector_size; #endif // ghost shell is communicated in coarse_operator_setup, so we 
need size=vector_size, not inner_vector_size MALLOC_HUGEPAGES( l->is_PRECISION.operator, complex_PRECISION, ((size_t)OPERATOR_COMPONENT_OFFSET_PRECISION)*((size_t)l->vector_size), 128 ); - l->is_PRECISION.test_vector[0] = NULL; - MALLOC_HUGEPAGES( l->is_PRECISION.test_vector[0], complex_PRECISION, n*l->inner_vector_size, 128 ); + vector_PRECISION_init(&(l->is_PRECISION.test_vector[0])); + MALLOC_HUGEPAGES( l->is_PRECISION.test_vector[0].vector_buffer, complex_PRECISION, n*l->inner_vector_size, 128 ); for ( k=1; kis_PRECISION.test_vector[k] = l->is_PRECISION.test_vector[0] + k*l->inner_vector_size; + l->is_PRECISION.test_vector[k].vector_buffer = l->is_PRECISION.test_vector[0].vector_buffer + k*l->inner_vector_size; } } void interpolation_PRECISION_dummy_alloc( level_struct *l ) { - MALLOC( l->is_PRECISION.test_vector, complex_PRECISION*, l->num_eig_vect ); - MALLOC( l->is_PRECISION.interpolation, complex_PRECISION*, l->num_eig_vect ); + MALLOC( l->is_PRECISION.test_vector, vector_PRECISION, l->num_eig_vect ); + MALLOC( l->is_PRECISION.interpolation, vector_PRECISION, l->num_eig_vect ); } void interpolation_PRECISION_dummy_free( level_struct *l ) { - FREE( l->is_PRECISION.test_vector, complex_PRECISION*, l->num_eig_vect ); - FREE( l->is_PRECISION.interpolation, complex_PRECISION*, l->num_eig_vect ); + FREE( l->is_PRECISION.test_vector, vector_PRECISION, l->num_eig_vect ); + FREE( l->is_PRECISION.interpolation, vector_PRECISION, l->num_eig_vect ); } @@ -67,12 +67,12 @@ void interpolation_PRECISION_free( level_struct *l ) { int n = l->num_eig_vect; - FREE_HUGEPAGES( l->is_PRECISION.test_vector[0], complex_PRECISION, n*l->inner_vector_size ); + FREE_HUGEPAGES( l->is_PRECISION.test_vector[0].vector_buffer, complex_PRECISION, n*l->inner_vector_size ); FREE( l->is_PRECISION.eigenvalues, complex_PRECISION, n ); - FREE( l->is_PRECISION.test_vector, complex_PRECISION*, n ); + FREE( l->is_PRECISION.test_vector, vector_PRECISION, n ); #ifndef INTERPOLATION_SETUP_LAYOUT_OPTIMIZED_PRECISION - FREE_HUGEPAGES( l->is_PRECISION.interpolation[0], complex_PRECISION, n*l->vector_size ); - FREE( l->is_PRECISION.interpolation, complex_PRECISION*, n ); + FREE_HUGEPAGES( l->is_PRECISION.interpolation[0]->vector_buffer, complex_PRECISION, n*l->vector_size ); + FREE( l->is_PRECISION.interpolation, vector_PRECISION, n ); #endif FREE_HUGEPAGES( l->is_PRECISION.operator, complex_PRECISION, OPERATOR_COMPONENT_OFFSET_PRECISION*l->vector_size ); } @@ -131,23 +131,23 @@ void define_interpolation_PRECISION_operator( complex_PRECISION **interpolation, } -void interpolate_PRECISION( vector_PRECISION phi, vector_PRECISION phi_c, level_struct *l, struct Thread *threading ) { +void interpolate_PRECISION( vector_PRECISION *phi, vector_PRECISION *phi_c, level_struct *l, struct Thread *threading ) { PROF_PRECISION_START( _PR, threading ); int i, j, k, k1, k2, num_aggregates = l->is_PRECISION.num_agg, num_eig_vect = l->num_eig_vect, num_parent_eig_vect = l->num_parent_eig_vect, aggregate_sites = l->num_inner_lattice_sites / num_aggregates; - complex_PRECISION *operator = l->is_PRECISION.operator, *phi_pt = phi, - *phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer; + complex_PRECISION *operator = l->is_PRECISION.operator, *phi_pt = phi->vector_buffer, + *phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer.vector_buffer; START_LOCKED_MASTER(threading) - vector_PRECISION_distribute( phi_c_pt, phi_c, l->next_level ); + vector_PRECISION_distribute( &(l->next_level->gs_PRECISION.transfer_buffer), phi_c, l->next_level ); 
END_LOCKED_MASTER(threading) SYNC_HYPERTHREADS(threading) #ifdef HAVE_TM1p1 if( g.n_flavours==2 ) for ( i=threading->n_thread*threading->core + threading->thread; in_core*threading->n_thread ) { - phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer + i*2*2*num_eig_vect; + phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer.vector_buffer + i*2*2*num_eig_vect; float tmp_phi1_c_re[2*OPERATOR_COMPONENT_OFFSET_float]; float tmp_phi1_c_im[2*OPERATOR_COMPONENT_OFFSET_float]; float tmp_phi2_c_re[2*OPERATOR_COMPONENT_OFFSET_float]; @@ -180,7 +180,7 @@ void interpolate_PRECISION( vector_PRECISION phi, vector_PRECISION phi_c, level_ int offset = SIMD_LENGTH_PRECISION; // loop over blocks of SIMD_LENGTH_PRECISION vectors for ( j=0; jvector_buffer + i*2*2*num_parent_eig_vect*aggregate_sites; operator = l->is_PRECISION.operator + j*l->vector_size/2 + i*2*offset*num_parent_eig_vect*aggregate_sites; for ( k=0; kn_thread*threading->core + threading->thread; in_core*threading->n_thread ) { - phi_pt = phi + i*2*num_parent_eig_vect*aggregate_sites; - phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer + i*2*num_eig_vect; + phi_pt = phi->vector_buffer + i*2*num_parent_eig_vect*aggregate_sites; + phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer.vector_buffer + i*2*num_eig_vect; float tmp_phi_c_re[2*OPERATOR_COMPONENT_OFFSET_float]; float tmp_phi_c_im[2*OPERATOR_COMPONENT_OFFSET_float]; __m128 zero = _mm_setzero_ps(); @@ -266,7 +266,7 @@ void interpolate_PRECISION( vector_PRECISION phi, vector_PRECISION phi_c, level_ int offset = SIMD_LENGTH_PRECISION; // loop over blocks of SIMD_LENGTH_PRECISION vectors for ( j=0; jvector_buffer + i*2*num_parent_eig_vect*aggregate_sites; operator = l->is_PRECISION.operator + j*l->vector_size + i*2*offset*num_parent_eig_vect*aggregate_sites; for ( k=0; kis_PRECISION.num_agg, num_eig_vect = l->num_eig_vect, num_parent_eig_vect = l->num_parent_eig_vect, aggregate_sites = l->num_inner_lattice_sites / num_aggregates; - complex_PRECISION *operator = l->is_PRECISION.operator, *phi_pt = phi, - *phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer; - + complex_PRECISION *operator = l->is_PRECISION.operator, *phi_pt = phi->vector_buffer, + *phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer.vector_buffer; + START_LOCKED_MASTER(threading) - vector_PRECISION_distribute( phi_c_pt, phi_c, l->next_level ); + vector_PRECISION_distribute( &(l->next_level->gs_PRECISION.transfer_buffer), phi_c, l->next_level ); END_LOCKED_MASTER(threading) SYNC_HYPERTHREADS(threading) #ifdef HAVE_TM1p1 if( g.n_flavours==2 ) for ( i=threading->n_thread*threading->core + threading->thread; in_core*threading->n_thread ) { - phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer + i*2*2*num_eig_vect; + phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer.vector_buffer + i*2*2*num_eig_vect; float tmp_phi1_c_re[2*OPERATOR_COMPONENT_OFFSET_float]; float tmp_phi1_c_im[2*OPERATOR_COMPONENT_OFFSET_float]; @@ -365,7 +365,7 @@ void interpolate3_PRECISION( vector_PRECISION phi, vector_PRECISION phi_c, level int offset = SIMD_LENGTH_PRECISION; // loop over blocks of SIMD_LENGTH_PRECISION vectors for ( j=0; jvector_buffer + i*2*2*num_parent_eig_vect*aggregate_sites; operator = l->is_PRECISION.operator + j*l->vector_size/2 + i*2*offset*num_parent_eig_vect*aggregate_sites; for ( k=0; kn_thread*threading->core + threading->thread; in_core*threading->n_thread ) { - phi_pt = phi + i*2*num_parent_eig_vect*aggregate_sites; - phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer + i*2*num_eig_vect; + phi_pt = 
phi->vector_buffer + i*2*num_parent_eig_vect*aggregate_sites; + phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer.vector_buffer + i*2*num_eig_vect; float tmp_phi_c_re[2*OPERATOR_COMPONENT_OFFSET_float]; float tmp_phi_c_im[2*OPERATOR_COMPONENT_OFFSET_float]; __m128 zero = _mm_setzero_ps(); @@ -453,7 +453,7 @@ void interpolate3_PRECISION( vector_PRECISION phi, vector_PRECISION phi_c, level int offset = SIMD_LENGTH_PRECISION; // loop over blocks of SIMD_LENGTH_PRECISION vectors for ( j=0; jvector_buffer + i*2*num_parent_eig_vect*aggregate_sites; operator = l->is_PRECISION.operator + j*l->vector_size + i*2*offset*num_parent_eig_vect*aggregate_sites; for ( k=0; kis_PRECISION.num_agg, num_eig_vect = l->num_eig_vect, num_parent_eig_vect = l->num_parent_eig_vect, aggregate_sites = l->num_inner_lattice_sites / num_aggregates; - complex_PRECISION *operator = l->is_PRECISION.operator, *phi_pt = phi, - *phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer; + complex_PRECISION *operator = l->is_PRECISION.operator, *phi_pt = phi->vector_buffer, + *phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer.vector_buffer; #ifdef HAVE_TM1p1 if( g.n_flavours==2 ) for ( i=threading->n_thread*threading->core + threading->thread; in_core*threading->n_thread ) { int offset = SIMD_LENGTH_PRECISION; - phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer + i*2*2*num_eig_vect; + phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer.vector_buffer + i*2*2*num_eig_vect; // loop over blocks of SIMD_LENGTH_PRECISION vectors for ( j=0; jvector_buffer + i*2*2*num_parent_eig_vect*aggregate_sites; operator = l->is_PRECISION.operator + j*l->vector_size/2 + i*2*offset*num_parent_eig_vect*aggregate_sites; // temporary, so we can used aligned load/store, and don't have to mess around with deinterleaving @@ -604,8 +604,8 @@ void restrict_PRECISION( vector_PRECISION phi_c, vector_PRECISION phi, level_str int offset = SIMD_LENGTH_PRECISION; // loop over blocks of SIMD_LENGTH_PRECISION vectors for ( j=0; jnext_level->gs_PRECISION.transfer_buffer + i*2*num_eig_vect; + phi_pt = phi->vector_buffer + i*2*num_parent_eig_vect*aggregate_sites; + phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer.vector_buffer + i*2*num_eig_vect; operator = l->is_PRECISION.operator + j*l->vector_size + i*2*offset*num_parent_eig_vect*aggregate_sites; // temporary, so we can used aligned load/store, and don't have to mess around with deinterleaving @@ -661,7 +661,7 @@ void restrict_PRECISION( vector_PRECISION phi_c, vector_PRECISION phi, level_str SYNC_HYPERTHREADS(threading) START_LOCKED_MASTER(threading) - vector_PRECISION_gather( phi_c, l->next_level->gs_PRECISION.transfer_buffer, l->next_level ); + vector_PRECISION_gather( phi_c, &(l->next_level->gs_PRECISION.transfer_buffer), l->next_level ); END_LOCKED_MASTER(threading) PROF_PRECISION_STOP( _PR, 1, threading ); } diff --git a/src/sse_interpolation_generic.h b/src/sse_interpolation_generic.h index 2db7a86..14eb693 100644 --- a/src/sse_interpolation_generic.h +++ b/src/sse_interpolation_generic.h @@ -28,9 +28,9 @@ void interpolation_PRECISION_dummy_alloc( level_struct *l ); void interpolation_PRECISION_dummy_free( level_struct *l ); - void interpolate_PRECISION( vector_PRECISION phi, vector_PRECISION phi_c, level_struct *l, Thread *threading ); - void interpolate3_PRECISION( vector_PRECISION phi, vector_PRECISION phi_c, level_struct *l, Thread *threading ); - void restrict_PRECISION( vector_PRECISION phi_c, vector_PRECISION phi, level_struct *l, Thread *threading ); + void 
interpolate_PRECISION( vector_PRECISION *phi, vector_PRECISION *phi_c, level_struct *l, Thread *threading ); + void interpolate3_PRECISION( vector_PRECISION *phi, vector_PRECISION *phi_c, level_struct *l, Thread *threading ); + void restrict_PRECISION( vector_PRECISION *phi_c, vector_PRECISION *phi, level_struct *l, Thread *threading ); #endif #endif \ No newline at end of file diff --git a/src/sse_linalg.c b/src/sse_linalg.c index bf0f9d6..2e1bc45 100644 --- a/src/sse_linalg.c +++ b/src/sse_linalg.c @@ -24,7 +24,7 @@ #ifdef SSE #ifdef OPTIMIZED_LINALG_double -void vector_double_scale( vector_double z, vector_double x, complex_double alpha, int start, int end, level_struct *l ) { +void vector_double_scale( vector_double *z, vector_double *x, complex_double alpha, int start, int end, level_struct *l ) { int thread = omp_get_thread_num(); if(thread == 0 && start != end) @@ -32,8 +32,8 @@ void vector_double_scale( vector_double z, vector_double x, complex_double alpha __m128d alpha_re = _mm_set1_pd( creal_double(alpha) ); __m128d alpha_im = _mm_set1_pd( cimag_double(alpha) ); - double *zd = (double*)(z+start); - double *xd = (double*)(x+start); + double *zd = (double*)(z->vector_buffer+start); + double *xd = (double*)(x->vector_buffer+start); for( int i=start; ivector_buffer[i]), i++, l ); // sum over cores START_NO_HYPERTHREADS(threading) @@ -380,7 +380,7 @@ double global_norm_double( vector_double x, int start, int end, level_struct *l, #endif #ifdef OPTIMIZED_LINALG_float -float global_norm_float( vector_float x, int start, int end, level_struct *l, struct Thread *threading ) { +float global_norm_float( vector_float *x, int start, int end, level_struct *l, struct Thread *threading ) { PROF_float_START( _GIP, threading ); @@ -449,7 +449,7 @@ float global_norm_float( vector_float x, int start, int end, level_struct *l, st #endif #ifdef OPTIMIZED_LINALG_double -void vector_double_multi_saxpy( vector_double z, vector_double *V, complex_double *alpha, +void vector_double_multi_saxpy( vector_double *z, vector_double *V, complex_double *alpha, int sign, int count, int start, int end, level_struct *l ) { int thread = omp_get_thread_num(); @@ -471,7 +471,7 @@ void vector_double_multi_saxpy( vector_double z, vector_double *V, complex_doubl FOR12( { __m128d z_re = _mm_loadu_pd( (double*)(z+i) ); - __m128d V_re = _mm_loadu_pd( (double*)(V[c]+i) ); + __m128d V_re = _mm_loadu_pd( (double*)(V[c].vector_buffer+i) ); z_re = sse_fmadd_pd( alpha_re[c], V_re, z_re ); _mm_storeu_pd( (double*)(z+i), z_re ); i++; @@ -486,7 +486,7 @@ void vector_double_multi_saxpy( vector_double z, vector_double *V, complex_doubl { __m128d z_re; __m128d z_im; __m128d V_re; __m128d V_im; sse_complex_deinterleaved_load_pd( (double*)(z+i), &z_re, &z_im ); - sse_complex_deinterleaved_load_pd( (double*)(V[c]+i), &V_re, &V_im ); + sse_complex_deinterleaved_load_pd( (double*)(V[c].vector_buffer+i), &V_re, &V_im ); cfmadd_pd(alpha_re[c], alpha_im[c], V_re, V_im, &z_re, &z_im); sse_complex_interleaved_store_pd( z_re, z_im, (double*)(z+i) ); i += SIMD_LENGTH_double; @@ -502,7 +502,7 @@ void vector_double_multi_saxpy( vector_double z, vector_double *V, complex_doubl #endif #ifdef OPTIMIZED_LINALG_float -void vector_float_multi_saxpy( vector_float z, vector_float *V, complex_float *alpha, +void vector_float_multi_saxpy( vector_float *z, vector_float *V, complex_float *alpha, int sign, int count, int start, int end, level_struct *l ) { __m128 V_re; __m128 V_im; @@ -528,7 +528,7 @@ void vector_float_multi_saxpy( vector_float z, vector_float 
*V, complex_float *a FOR6( { z_re = _mm_loadu_ps( (float*)(z+i) ); - V_re = _mm_loadu_ps( (float*)(V[c]+i) ); + V_re = _mm_loadu_ps( (float*)(V[c].vector_buffer+i) ); z_re = sse_fmadd( alpha_re[c], V_re, z_re ); _mm_storeu_ps( (float*)(z+i), z_re ); i+=2; @@ -542,7 +542,7 @@ void vector_float_multi_saxpy( vector_float z, vector_float *V, complex_float *a FOR3( { sse_complex_deinterleaved_load( (float*)(z+i), &z_re, &z_im ); - sse_complex_deinterleaved_load( (float*)(V[c]+i), &V_re, &V_im ); + sse_complex_deinterleaved_load( (float*)(V[c].vector_buffer+i), &V_re, &V_im ); cfmadd(alpha_re[c], alpha_im[c], V_re, V_im, &z_re, &z_im); sse_complex_interleaved_store( z_re, z_im, (float*)(z+i) ); i+=SIMD_LENGTH_float; @@ -556,7 +556,7 @@ void vector_float_multi_saxpy( vector_float z, vector_float *V, complex_float *a for ( int c=0; cthread != 0) @@ -39,7 +39,7 @@ void rhs_define( vector_double rhs, level_struct *l, struct Thread *threading ) vector_double_define( rhs, 0, start, end, l ); if ( g.my_rank == 0 ) { START_LOCKED_MASTER(threading) - rhs[0] = 1.0; + rhs->vector_buffer[0] = 1.0; END_LOCKED_MASTER(threading) } START_MASTER(threading) @@ -62,7 +62,7 @@ void rhs_define( vector_double rhs, level_struct *l, struct Thread *threading ) } -int wilson_driver( vector_double solution, vector_double source, level_struct *l, struct Thread *threading ) { +int wilson_driver( vector_double *solution, vector_double *source, level_struct *l, struct Thread *threading ) { int iter = 0, start = threading->start_index[l->depth], end = threading->end_index[l->depth]; @@ -79,7 +79,7 @@ int wilson_driver( vector_double solution, vector_double source, level_struct *l double tmp_t = -MPI_Wtime(); #endif - vector_double_copy( rhs, source, start, end, l ); + vector_double_copy( &rhs, source, start, end, l ); if ( g.method == -1 ) { cgn_double( &(g.p), l, threading ); } else if ( g.mixed_precision == 2 ) { @@ -87,7 +87,7 @@ int wilson_driver( vector_double solution, vector_double source, level_struct *l } else { iter = fgmres_double( &(g.p), l, threading ); } - vector_double_copy( solution, sol, start, end, l ); + vector_double_copy( solution, &sol, start, end, l ); #ifdef WILSON_BENCHMARK tmp_t += MPI_Wtime(); if ( tmp_t < t_min ) @@ -105,13 +105,13 @@ int wilson_driver( vector_double solution, vector_double source, level_struct *l } -void solve( vector_double solution, vector_double source, level_struct *l, struct Thread *threading ) { +void solve( vector_double *solution, vector_double *source, level_struct *l, struct Thread *threading ) { if ( g.vt.evaluation ) { vector_double rhs = g.mixed_precision==2?g.p_MP.dp.b:g.p.b; // this would yield different results if we threaded it, so we don't START_LOCKED_MASTER(threading) - vector_double_define_random( rhs, 0, l->inner_vector_size, l ); + vector_double_define_random( &rhs, 0, l->inner_vector_size, l ); scan_var( &(g.vt), l ); END_LOCKED_MASTER(threading) } else { @@ -122,8 +122,11 @@ void solve( vector_double solution, vector_double source, level_struct *l, struc void solve_driver( level_struct *l, struct Thread *threading ) { - vector_double solution = NULL, source = NULL; + vector_double solution, source; double minus_twisted_bc[4], norm; + + vector_double_init(&solution); + vector_double_init(&source); if(g.bc==2) for ( int i=0; i<4; i++ ) @@ -135,15 +138,15 @@ void solve_driver( level_struct *l, struct Thread *threading ) { printf0("inverting doublet operator\n"); } #endif - PUBLIC_MALLOC( solution, complex_double, l->inner_vector_size ); - PUBLIC_MALLOC( 
source, complex_double, l->inner_vector_size ); + PUBLIC_MALLOC( solution.vector_buffer, complex_double, l->inner_vector_size ); + PUBLIC_MALLOC( source.vector_buffer, complex_double, l->inner_vector_size ); - rhs_define( source, l, threading ); + rhs_define( &source, l, threading ); if(g.bc==2) - apply_twisted_bc_to_vector_double( source, source, g.twisted_bc, l); + apply_twisted_bc_to_vector_double( &source, &source, g.twisted_bc, l); - norm = global_norm_double( source, 0, l->inner_vector_size, l, threading ); + norm = global_norm_double( &source, 0, l->inner_vector_size, l, threading ); printf0("source vector norm: %le\n",norm); #ifdef HAVE_TM1p1 @@ -157,10 +160,10 @@ void solve_driver( level_struct *l, struct Thread *threading ) { printf0("\n\n+--------------------------- up ---------------------------+\n\n"); END_MASTER(threading) - solve( solution, source, l, threading ); + solve( &solution, &source, l, threading ); if(g.bc==2) - apply_twisted_bc_to_vector_double( solution, solution, minus_twisted_bc, l); + apply_twisted_bc_to_vector_double( &solution, &solution, minus_twisted_bc, l); START_LOCKED_MASTER(threading) printf0("\n\n+-------------------------- down --------------------------+\n\n"); @@ -174,16 +177,16 @@ void solve_driver( level_struct *l, struct Thread *threading ) { } #endif - solve( solution, source, l, threading ); + solve( &solution, &source, l, threading ); if(g.bc==2) - apply_twisted_bc_to_vector_double( solution, solution, minus_twisted_bc, l); + apply_twisted_bc_to_vector_double( &solution, &solution, minus_twisted_bc, l); - norm = global_norm_double( solution, 0, l->inner_vector_size, l, threading ); + norm = global_norm_double( &solution, 0, l->inner_vector_size, l, threading ); printf0("solution vector norm: %le\n",norm); - PUBLIC_FREE( solution, complex_double, l->inner_vector_size ); - PUBLIC_FREE( source, complex_double, l->inner_vector_size ); + PUBLIC_FREE( solution.vector_buffer, complex_double, l->inner_vector_size ); + PUBLIC_FREE( source.vector_buffer, complex_double, l->inner_vector_size ); #ifdef HAVE_TM1p1 if( g.n_flavours == 2 ) diff --git a/src/top_level.h b/src/top_level.h index cc4b029..a281daa 100644 --- a/src/top_level.h +++ b/src/top_level.h @@ -24,9 +24,9 @@ struct Thread; - void rhs_define( vector_double rhs, level_struct *l, struct Thread *threading ); - int wilson_driver( vector_double solution, vector_double source, level_struct *l, struct Thread *threading ); - void solve( vector_double solution, vector_double source, level_struct *l, struct Thread *threading ); + void rhs_define( vector_double *rhs, level_struct *l, struct Thread *threading ); + int wilson_driver( vector_double *solution, vector_double *source, level_struct *l, struct Thread *threading ); + void solve( vector_double *solution, vector_double *source, level_struct *l, struct Thread *threading ); void solve_driver( level_struct *l, struct Thread *threading ); #endif diff --git a/src/var_table.h b/src/var_table.h index abb321c..bc9fa36 100644 --- a/src/var_table.h +++ b/src/var_table.h @@ -33,17 +33,17 @@ warning0("SCAN_VAR does not support threading, yet.\n"); \ kind *tmp_var = (kind*)(var_pt); \ kind signum = (start_valinner_vector_size ); \ + MALLOC( v->vector_buffer, complex_double, l->inner_vector_size ); \ if (g.mixed_precision==2) fgmres_MP( &(g.p_MP), l, no_threading ); \ else fgmres_double( &(g.p), l, no_threading ); \ - vector_double_copy( v, x, 0, l->inner_vector_size, l ); \ + vector_double_copy( v, &x, 0, l->inner_vector_size, l ); \ norm_v = 
global_norm_double( v, 0, l->inner_vector_size, l, no_threading ); \ } \ \ @@ -68,32 +68,32 @@ } \ printf0("scanning variable \"%s\", value: %lf, run %d of %d\n", name, (double)(*tmp_var), i+1, g.vt.average_over ); \ if ( g.vt.track_error ) { \ - apply_operator_double( b, v, &(g.p), l, no_threading ); \ - vector_double_define( x, 0, 0, l->inner_vector_size, l ); \ + apply_operator_double( &b, v, &(g.p), l, no_threading ); \ + vector_double_define( &x, 0, 0, l->inner_vector_size, l ); \ if ( g.vt.track_cgn_error ) { \ ASSERT( g.method >=0 && g.p.restart_length >= 4 ); \ - vector_double_define( x, 0, 0, l->inner_vector_size, l ); \ + vector_double_define( &x, 0, 0, l->inner_vector_size, l ); \ cgn_double( &(g.p), l, no_threading ); \ - vector_double_minus( x, x, v, 0, l->inner_vector_size, l ); \ - g.vt.p_end->values[_CGNR_ERR] += ( global_norm_double( x, 0, l->inner_vector_size, l, no_threading ) / norm_v ) / ((double)g.vt.average_over); \ + vector_double_minus( &x, &x, v, 0, l->inner_vector_size, l ); \ + g.vt.p_end->values[_CGNR_ERR] += ( global_norm_double( &x, 0, l->inner_vector_size, l, no_threading ) / norm_v ) / ((double)g.vt.average_over); \ printf0("CGN: error norm: %le\n", g.vt.p_end->values[_CGNR_ERR] ); \ - vector_double_define( x, 0, 0, l->inner_vector_size, l ); \ + vector_double_define( &x, 0, 0, l->inner_vector_size, l ); \ } \ } else {\ - rhs_define( b, l, no_threading );\ + rhs_define( &b, l, no_threading );\ } \ - vector_double_define( x, 0, 0, l->inner_vector_size, l ); \ + vector_double_define( &x, 0, 0, l->inner_vector_size, l ); \ if (g.mixed_precision==2) fgmres_MP( &(g.p_MP), l, no_threading ); \ else fgmres_double( &(g.p), l, no_threading ); \ if ( i == g.vt.average_over-1 ) prof_print( l ); \ if ( g.vt.track_error ) { \ - vector_double_minus( x, x, v, 0, l->inner_vector_size, l ); \ - g.vt.p_end->values[_SLV_ERR] += ( global_norm_double( x, 0, l->inner_vector_size, l, no_threading ) / norm_v ) / ((double)g.vt.average_over); \ + vector_double_minus( &x, &x, v, 0, l->inner_vector_size, l ); \ + g.vt.p_end->values[_SLV_ERR] += ( global_norm_double( &x, 0, l->inner_vector_size, l, no_threading ) / norm_v ) / ((double)g.vt.average_over); \ } \ } \ } \ if ( g.vt.track_error ) { \ - FREE( v, complex_double, l->inner_vector_size ); \ + FREE( v->vector_buffer, complex_double, l->inner_vector_size ); \ } \ tt1 = MPI_Wtime(); \ printf0("\n\ntotal time for parameter scan: %d minutes and %d seconds\n", \ diff --git a/src/vcycle_generic.c b/src/vcycle_generic.c index 038a8fa..cfabc5b 100644 --- a/src/vcycle_generic.c +++ b/src/vcycle_generic.c @@ -22,10 +22,10 @@ #include "main.h" #include "vcycle_PRECISION.h" -void smoother_PRECISION( vector_PRECISION phi, vector_PRECISION Dphi, vector_PRECISION eta, +void smoother_PRECISION( vector_PRECISION *phi, vector_PRECISION *Dphi, vector_PRECISION *eta, int n, const int res, level_struct *l, struct Thread *threading ) { - ASSERT( phi != eta ); + ASSERT( phi->vector_buffer != eta->vector_buffer ); START_MASTER(threading); PROF_PRECISION_START( _SM ); @@ -47,10 +47,10 @@ void smoother_PRECISION( vector_PRECISION phi, vector_PRECISION Dphi, vector_PRE if ( g.method == 4 || g.method == 6 ) { if ( g.odd_even ) { if ( res == _RES ) { - apply_operator_PRECISION( l->sp_PRECISION.x, phi, &(l->p_PRECISION), l, threading ); - vector_PRECISION_minus( l->sp_PRECISION.x, eta, l->sp_PRECISION.x, start, end, l ); + apply_operator_PRECISION( &(l->sp_PRECISION.x), phi, &(l->p_PRECISION), l, threading ); + vector_PRECISION_minus( &(l->sp_PRECISION.x), 
eta, &(l->sp_PRECISION.x), start, end, l ); } - block_to_oddeven_PRECISION( l->sp_PRECISION.b, res==_RES?l->sp_PRECISION.x:eta, l, threading ); + block_to_oddeven_PRECISION( &(l->sp_PRECISION.b), res==_RES?&(l->sp_PRECISION.x):eta, l, threading ); START_LOCKED_MASTER(threading) l->sp_PRECISION.initial_guess_zero = _NO_RES; END_LOCKED_MASTER(threading) @@ -62,21 +62,21 @@ void smoother_PRECISION( vector_PRECISION phi, vector_PRECISION Dphi, vector_PRE else coarse_solve_odd_even_PRECISION( &(l->sp_PRECISION), &(l->oe_op_PRECISION), l, threading ); } if ( res == _NO_RES ) { - oddeven_to_block_PRECISION( phi, l->sp_PRECISION.x, l, threading ); + oddeven_to_block_PRECISION( phi, &(l->sp_PRECISION.x), l, threading ); } else { - oddeven_to_block_PRECISION( l->sp_PRECISION.b, l->sp_PRECISION.x, l, threading ); - vector_PRECISION_plus( phi, phi, l->sp_PRECISION.b, start, end, l ); + oddeven_to_block_PRECISION( &(l->sp_PRECISION.b), &(l->sp_PRECISION.x), l, threading ); + vector_PRECISION_plus( phi, phi, &(l->sp_PRECISION.b), start, end, l ); } } else { START_LOCKED_MASTER(threading) - l->sp_PRECISION.x = phi; l->sp_PRECISION.b = eta; + l->sp_PRECISION.x = *phi; l->sp_PRECISION.b = *eta; END_LOCKED_MASTER(threading) fgmres_PRECISION( &(l->sp_PRECISION), l, threading ); } } else if ( g.method == 5 ) { - vector_PRECISION_copy( l->sp_PRECISION.b, eta, start, end, l ); + vector_PRECISION_copy( &(l->sp_PRECISION.b), eta, start, end, l ); bicgstab_PRECISION( &(l->sp_PRECISION), l, threading ); - vector_PRECISION_copy( phi, l->sp_PRECISION.x, start, end, l ); + vector_PRECISION_copy( phi, &(l->sp_PRECISION.x), start, end, l ); } ASSERT( Dphi == NULL ); } @@ -87,19 +87,19 @@ void smoother_PRECISION( vector_PRECISION phi, vector_PRECISION Dphi, vector_PRE } -void vcycle_PRECISION( vector_PRECISION phi, vector_PRECISION Dphi, vector_PRECISION eta, +void vcycle_PRECISION( vector_PRECISION *phi, vector_PRECISION *Dphi, vector_PRECISION *eta, int res, level_struct *l, struct Thread *threading ) { if ( g.interpolation && l->level>0 ) { for ( int i=0; in_cy; i++ ) { if ( i==0 && res == _NO_RES ) { - restrict_PRECISION( l->next_level->p_PRECISION.b, eta, l, threading ); + restrict_PRECISION( &(l->next_level->p_PRECISION.b), eta, l, threading ); } else { int start = threading->start_index[l->depth]; int end = threading->end_index[l->depth]; - apply_operator_PRECISION( l->vbuf_PRECISION[2], phi, &(l->p_PRECISION), l, threading ); - vector_PRECISION_minus( l->vbuf_PRECISION[3], eta, l->vbuf_PRECISION[2], start, end, l ); - restrict_PRECISION( l->next_level->p_PRECISION.b, l->vbuf_PRECISION[3], l, threading ); + apply_operator_PRECISION( &(l->vbuf_PRECISION[2]), phi, &(l->p_PRECISION), l, threading ); + vector_PRECISION_minus( &(l->vbuf_PRECISION[3]), eta, &(l->vbuf_PRECISION[2]), start, end, l ); + restrict_PRECISION( &(l->next_level->p_PRECISION.b), &(l->vbuf_PRECISION[3]), l, threading ); } if ( !l->next_level->idle ) { START_MASTER(threading) @@ -110,7 +110,7 @@ void vcycle_PRECISION( vector_PRECISION phi, vector_PRECISION Dphi, vector_PRECI if ( g.kcycle ) fgmres_PRECISION( &(l->next_level->p_PRECISION), l->next_level, threading ); else - vcycle_PRECISION( l->next_level->p_PRECISION.x, NULL, l->next_level->p_PRECISION.b, _NO_RES, l->next_level, threading ); + vcycle_PRECISION( &(l->next_level->p_PRECISION.x), NULL, &(l->next_level->p_PRECISION.b), _NO_RES, l->next_level, threading ); } else { if ( g.odd_even ) { if ( g.method == 6 ) { @@ -128,9 +128,9 @@ void vcycle_PRECISION( vector_PRECISION phi, 
vector_PRECISION Dphi, vector_PRECI
         END_MASTER(threading)
       }
       if( i == 0 && res == _NO_RES )
-        interpolate3_PRECISION( phi, l->next_level->p_PRECISION.x, l, threading );
+        interpolate3_PRECISION( phi, &(l->next_level->p_PRECISION.x), l, threading );
       else
-        interpolate_PRECISION( phi, l->next_level->p_PRECISION.x, l, threading );
+        interpolate_PRECISION( phi, &(l->next_level->p_PRECISION.x), l, threading );
       smoother_PRECISION( phi, Dphi, eta, l->post_smooth_iter, _RES, l, threading );
       res = _RES;
     }
diff --git a/src/vcycle_generic.h b/src/vcycle_generic.h
index 5e54a74..8c251f6 100644
--- a/src/vcycle_generic.h
+++ b/src/vcycle_generic.h
@@ -32,10 +32,10 @@
   #include "threading.h"
   #include "solver_analysis.h"
 
-  void smoother_PRECISION( vector_PRECISION phi, vector_PRECISION Dphi, vector_PRECISION eta,
+  void smoother_PRECISION( vector_PRECISION *phi, vector_PRECISION *Dphi, vector_PRECISION *eta,
                            int n, const int res, level_struct *l, struct Thread *threading );
 
-  void vcycle_PRECISION( vector_PRECISION phi, vector_PRECISION Dphi, vector_PRECISION eta,
+  void vcycle_PRECISION( vector_PRECISION *phi, vector_PRECISION *Dphi, vector_PRECISION *eta,
                          int res, level_struct *l, struct Thread *threading );
 
 #endif
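Note on the two signatures just changed: smoother_PRECISION and vcycle_PRECISION implement the multigrid V-cycle that all of the pointer rewrites in this series thread through. As a reading aid, the recursion can be paraphrased as below. This is an illustrative sketch, not code from the patch: Vector and Level stand in for vector_PRECISION and level_struct, the helper names mirror the calls visible in the hunks above, and the cycle-count loop plus the k-cycle and odd-even branches are omitted.

    /* Illustrative paraphrase of vcycle_PRECISION (not part of the patch).
     * Vector/Level and the helper names are simplified stand-ins for
     * vector_PRECISION/level_struct and the PRECISION-generic routines. */
    void vcycle( Vector *phi, Vector *eta, int res, Level *l ) {
      if ( l->level > 0 ) {                                    /* not yet on the coarsest grid */
        if ( res == _NO_RES ) {
          restrict_to_coarse( l->next_level->b, eta, l );      /* b_c = R eta                  */
        } else {
          apply_operator( l->tmp, phi, l );                    /* tmp = D phi                  */
          vector_minus( l->tmp2, eta, l->tmp, l );             /* residual r = eta - D phi     */
          restrict_to_coarse( l->next_level->b, l->tmp2, l );  /* b_c = R r                    */
        }
        vcycle( l->next_level->x, l->next_level->b, _NO_RES, l->next_level );
        interpolate( phi, l->next_level->x, l );               /* phi = P x_c (or phi += P x_c) */
        smoother( phi, eta, l->post_smooth_iter, l );          /* post-smoothing               */
      } else {
        smoother( phi, eta, l->post_smooth_iter, l );          /* bottom-level solve/smooth    */
      }
    }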
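Before the new generic vector module below, a minimal usage sketch of the API it introduces. This is illustrative and not part of the patch: vector_double is the double-precision instance the build generates from vector_PRECISION, MALLOC/FREE are the library's allocation macros as used elsewhere in this series, and the length n and level pointer l are hypothetical placeholders.

    /* Illustrative usage of the new wrapped-vector API (not part of the patch).
     * Assumes the generated double-precision variants; n and l are placeholders. */
    vector_double v, w;
    vector_double_init( &v );                         /* vector_buffer starts out NULL          */
    vector_double_init( &w );
    MALLOC( v.vector_buffer, complex_double, n );     /* raw buffer now lives inside the struct */
    MALLOC( w.vector_buffer, complex_double, n );
    vector_double_define( &v, 1.0, 0, n, l );         /* v[i] = 1.0 on [0,n)                    */
    vector_double_copy( &w, &v, 0, n, l );            /* w := v                                 */
    vector_double_real_scale( &w, &w, 2.0, 0, n, l ); /* w := 2.0 * w (real part of alpha)      */
    FREE( v.vector_buffer, complex_double, n );
    FREE( w.vector_buffer, complex_double, n );

Routines across the series now take vector_double*/vector_float* handles and index through ->vector_buffer; wrapping the buffer in a struct presumably lets later patches extend the handle without revisiting every call site.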
+ * + */ + +#include "main.h" + +void vector_PRECISION_init( vector_PRECISION *vec ) { + + vec->vector_buffer = NULL; +} + +/*void vector_PRECISION_alloc( vector_PRECISION *vec, const int type, int num_vect, level_struct *l ) { + + MALLOC( vec->vector_buffer, complex_PRECISION, num_vect ); +} + +void vector_PRECISION_free( vector_PRECISION *vec, const int type, int num_vect, level_struct *l ) { + + FREE( vec->vector_buffer, complex_PRECISION, num_vect ); +} +*/ + +// vector storage for PRECISION precision +void vector_PRECISION_define( vector_PRECISION *phi, complex_PRECISION value, int start, int end, level_struct *l ) { + + int thread = omp_get_thread_num(); + if(thread == 0 && start != end) + PROF_PRECISION_START( _SET ); + if ( phi->vector_buffer != NULL ) { + int i; + for ( i=start; i<end; i++ ) + phi->vector_buffer[i] = value; + } else { + error0("Error in \"vector_PRECISION_define\": pointer is null\n"); + } + if(thread == 0 && start != end) + PROF_PRECISION_STOP( _SET, 1 ); +} + + +void vector_PRECISION_real_scale( vector_PRECISION *z, vector_PRECISION *x, complex_PRECISION alpha, + int start, int end, level_struct *l ) { + + PRECISION *r_z = (PRECISION*)z->vector_buffer, *r_x = (PRECISION*)x->vector_buffer, r_alpha = creal_PRECISION(alpha); + int r_start = 2*start, r_end = 2*end; + + int thread = omp_get_thread_num(); + if(thread == 0 && start != end) + PROF_PRECISION_START( _LA2 ); + + REAL_VECTOR_FOR( int i=r_start, i<r_end, r_z[i] = r_alpha*r_x[i], i++, l ); + + if(thread == 0 && start != end) + PROF_PRECISION_STOP( _LA2, (double)(end-start)/(double)l->inner_vector_size ); +} + + +void vector_PRECISION_copy( vector_PRECISION *z, vector_PRECISION *x, int start, int end, level_struct *l ) { + + int thread = omp_get_thread_num(); + if(thread == 0 && start != end) + PROF_PRECISION_START( _CPY ); + + VECTOR_FOR( int i=start, i<end, z->vector_buffer[i] = x->vector_buffer[i], i++, l ); + + if(thread == 0 && start != end) + PROF_PRECISION_STOP( _CPY, (double)(end-start)/(double)l->inner_vector_size ); +} +/* +void vector_PRECISION_test_routine( vector_PRECISION *vec, level_struct *l, struct Thread *threading ) { + +}*/ diff --git a/src/vector_generic.h b/src/vector_generic.h new file mode 100644 index 0000000..26a3970 --- /dev/null +++ b/src/vector_generic.h @@ -0,0 +1,37 @@ +/* + * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder. + * + * This file is part of the DDalphaAMG solver library. + * + * The DDalphaAMG solver library is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * The DDalphaAMG solver library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * + * You should have received a copy of the GNU General Public License + * along with the DDalphaAMG solver library. If not, see http://www.gnu.org/licenses/. 
+ * + */ + +#ifndef VECTOR_PRECISION_HEADER + #define VECTOR_PRECISION_HEADER + + struct Thread; + + void vector_PRECISION_init( vector_PRECISION *vec ); + // void vector_PRECISION_alloc( vector_PRECISION *vec, const int type, int num_vect, level_struct *l ); + void vector_PRECISION_define( vector_PRECISION *phi, complex_PRECISION value, int start, int end, level_struct *l ); + void vector_PRECISION_real_scale( vector_PRECISION *z, vector_PRECISION *x, complex_PRECISION alpha, + int start, int end, level_struct *l ); + void vector_PRECISION_copy( vector_PRECISION *z, vector_PRECISION *x, int start, int end, level_struct *l ); // z := x + // void vector_PRECISION_free( vector_PRECISION *vec, const int type, int num_vect, level_struct *l ); + + // void vector_PRECISION_test_routine( vector_PRECISION *vec, level_struct *l, struct Thread *threading ); + +#endif From c01dde50c594d741ab240403d47af22db9bad761 Mon Sep 17 00:00:00 2001 From: Marc Illa Date: Wed, 1 Aug 2018 11:10:32 +0300 Subject: [PATCH 03/31] Fixing vector_buffer incrementation --- src/coarse_oddeven_generic.c | 29 ++-- src/coarse_operator_generic.c | 174 ++++++++++--------- src/coarse_operator_generic.h | 316 +++++++++++++++++----------------- src/dirac_generic.c | 253 +++++++++++++-------------- 4 files changed, 389 insertions(+), 383 deletions(-) diff --git a/src/coarse_oddeven_generic.c b/src/coarse_oddeven_generic.c index 615ba9d..f6d9441 100644 --- a/src/coarse_oddeven_generic.c +++ b/src/coarse_oddeven_generic.c @@ -309,6 +309,8 @@ void coarse_diag_ee_PRECISION( vector_PRECISION *y, vector_PRECISION *x, operato void coarse_diag_oo_PRECISION( vector_PRECISION *y, vector_PRECISION *x, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { int start, end; + vector_PRECISION x_pt, y_pt; + #ifndef OPTIMIZED_COARSE_SELF_COUPLING_PRECISION int num_site_var=l->num_lattice_site_var, oo_inv_size = SQUARE(num_site_var); @@ -317,23 +319,23 @@ void coarse_diag_oo_PRECISION( vector_PRECISION *y, vector_PRECISION *x, operato #else config_PRECISION sc = op->clover_oo_inv; #endif - + compute_core_start_end_custom( 0, op->num_odd_sites, &start, &end, l, threading, 1 ); - x->vector_buffer += num_site_var*(op->num_even_sites+start); - y->vector_buffer += num_site_var*(op->num_even_sites+start); + x_pt.vector_buffer = x->vector_buffer + num_site_var*(op->num_even_sites+start); + y_pt.vector_buffer = y->vector_buffer + num_site_var*(op->num_even_sites+start); sc += oo_inv_size*start; for ( int i=start; ivector_buffer += num_site_var; - y->vector_buffer += num_site_var; + x_pt.vector_buffer += num_site_var; + y_pt.vector_buffer += num_site_var; sc += oo_inv_size; } #else compute_core_start_end_custom( op->num_even_sites, l->num_inner_lattice_sites, &start, &end, l, threading, 1 ); - coarse_self_couplings_PRECISION_vectorized( y, x, op, start, end, l ); + coarse_self_couplings_PRECISION_vectorized( &y_pt, &x_pt, op, start, end, l ); #endif } @@ -347,6 +349,7 @@ void coarse_diag_oo_inv_PRECISION( vector_PRECISION *y, vector_PRECISION *x, ope level_struct *l, struct Thread *threading ) { int start, end; + vector_PRECISION x_pt, y_pt; compute_core_start_end_custom( 0, op->num_odd_sites, &start, &end, l, threading, 1 ); // odd sites @@ -369,20 +372,20 @@ void coarse_diag_oo_inv_PRECISION( vector_PRECISION *y, vector_PRECISION *x, ope #endif #endif - x->vector_buffer += num_site_var*(op->num_even_sites+start); - y->vector_buffer += num_site_var*(op->num_even_sites+start); + x_pt.vector_buffer = x->vector_buffer + 
num_site_var*(op->num_even_sites+start); + y_pt.vector_buffer = y->vector_buffer + num_site_var*(op->num_even_sites+start); sc += oo_inv_size*start; for ( int i=start; ivector_buffer[j] = _COMPLEX_PRECISION_ZERO; - cgemv( num_site_var, sc, lda, (float *)x, (float *)y); + y_pt.vector_buffer[j] = _COMPLEX_PRECISION_ZERO; + cgemv( num_site_var, sc, lda, (float *)&x_pt, (float *)&y_pt); #endif - x->vector_buffer += num_site_var; - y->vector_buffer += num_site_var; + x_pt.vector_buffer += num_site_var; + y_pt.vector_buffer += num_site_var; sc += oo_inv_size; } } diff --git a/src/coarse_operator_generic.c b/src/coarse_operator_generic.c index b7f8c53..d316d0d 100644 --- a/src/coarse_operator_generic.c +++ b/src/coarse_operator_generic.c @@ -607,30 +607,29 @@ void coarse_operator_PRECISION_set_self_couplings( operator_PRECISION_struct *op void coarse_gamma5_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, int start, int end, level_struct *l ) { int j, k=l->num_lattice_site_var/2; - vector_PRECISION eta_end; + buffer_PRECISION eta_end, eta_pt, phi_pt; + eta_end = eta->vector_buffer + end; + phi_pt = phi->vector_buffer + start; + eta_pt = eta->vector_buffer + start; - eta_end.vector_buffer = eta->vector_buffer+end; - phi->vector_buffer += start; - eta->vector_buffer += start; - - if ( eta->vector_buffer != phi->vector_buffer ) { - while ( eta->vector_buffer < eta_end.vector_buffer ) { + if ( eta_pt != phi_pt ) { + while ( eta_pt < eta_end ) { for ( j=0; jvector_buffer = -(*phi->vector_buffer); - eta->vector_buffer++; phi->vector_buffer++; + *eta_pt = -(*phi_pt); + eta_pt++; phi_pt++; } for ( j=0; jvector_buffer = *phi->vector_buffer; - eta->vector_buffer++; phi->vector_buffer++; + *eta_pt = *phi_pt; + eta_pt++; phi_pt++; } } } else { - while ( eta->vector_buffer < eta_end.vector_buffer ) { + while ( eta_pt < eta_end ) { for ( j=0; jvector_buffer = -(*eta->vector_buffer); - eta->vector_buffer++; + *eta_pt = -(*eta_pt); + eta_pt++; } - eta->vector_buffer+=k; + eta_pt+=k; } } } @@ -640,35 +639,35 @@ void coarse_tau1_gamma5_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, #ifdef HAVE_TM1p1 if ( g.n_flavours == 2 ) { int j, k=l->num_lattice_site_var/4; - vector_PRECISION eta_end; + buffer_PRECISION eta_end, phi_pt, eta_pt; - eta_end.vector_buffer = eta->vector_buffer+end; - phi->vector_buffer += start; - eta->vector_buffer += start; + eta_end = eta->vector_buffer + end; + phi_pt = phi->vector_buffer + start; + eta_pt = eta->vector_buffer + start; - ASSERT( eta->vector_buffer != phi->vector_buffer ); - while ( eta->vector_buffer < eta_end.vector_buffer ) { - phi->vector_buffer += k; + ASSERT( eta_pt != phi_pt ); + while ( eta_pt < eta_end ) { + phi_pt += k; for ( j=0; jvector_buffer = -(*phi->vector_buffer); - eta->vector_buffer++; phi->vector_buffer++; + *eta_pt = -(*phi_pt); + eta_pt++; phi_pt++; } - phi->vector_buffer -= 2*k; + phi_pt -= 2*k; for ( j=0; jvector_buffer = -(*phi->vector_buffer); - eta->vector_buffer++; phi->vector_buffer++; + *eta_pt = -(*phi_pt); + eta_pt++; phi_pt++; } - phi->vector_buffer += 2*k; + phi_pt += 2*k; for ( j=0; jvector_buffer = *phi->vector_buffer; - eta->vector_buffer++; phi->vector_buffer++; + *eta_pt = *phi_pt; + eta_pt++; phi_pt++; } - phi->vector_buffer -= 2*k; + phi_pt -= 2*k; for ( j=0; jvector_buffer = *phi->vector_buffer; - eta->vector_buffer++; phi->vector_buffer++; + *eta_pt = *phi_pt; + eta_pt++; phi_pt++; } - phi->vector_buffer += k; + phi_pt += k; } } else #endif @@ -730,15 +729,18 @@ void coarse_operator_PRECISION_test_routine( level_struct 
*l, struct Thread *thr int vs = l->vector_size, ivs = l->inner_vector_size, cvs = l->next_level->vector_size, civs = l->next_level->inner_vector_size; PRECISION diff = 0; - vector_PRECISION *vp1=NULL, vp2, vp3, vp4, *vc1=NULL, vc2, vc3; + vector_PRECISION vp1, vp2, vp3, vp4, vc1, vc2, vc3; + + vector_PRECISION_init(&vp1); + vector_PRECISION_init(&vc1); - PUBLIC_MALLOC( vp1->vector_buffer, complex_PRECISION, 4*vs ); - PUBLIC_MALLOC( vc1->vector_buffer, complex_PRECISION, 3*cvs ); + PUBLIC_MALLOC( vp1.vector_buffer, complex_PRECISION, 4*vs ); + PUBLIC_MALLOC( vc1.vector_buffer, complex_PRECISION, 3*cvs ); SYNC_MASTER_TO_ALL(threading) - vp2.vector_buffer = vp1->vector_buffer + vs; vp3.vector_buffer = vp2.vector_buffer + vs; vp4.vector_buffer = vp3.vector_buffer + vs; - vc2.vector_buffer = vc1->vector_buffer + cvs; vc3.vector_buffer = vc2.vector_buffer + cvs; + vp2.vector_buffer = vp1.vector_buffer + vs; vp3.vector_buffer = vp2.vector_buffer + vs; vp4.vector_buffer = vp3.vector_buffer + vs; + vc2.vector_buffer = vc1.vector_buffer + cvs; vc3.vector_buffer = vc2.vector_buffer + cvs; START_LOCKED_MASTER(threading) #ifdef HAVE_TM1p1 @@ -763,33 +765,33 @@ void coarse_operator_PRECISION_test_routine( level_struct *l, struct Thread *thr } if ( !l->next_level->idle ) - vector_PRECISION_define_random( vc1, 0, civs, l->next_level ); - vector_PRECISION_distribute( &vc2, vc1, l->next_level ); + vector_PRECISION_define_random( &vc1, 0, civs, l->next_level ); + vector_PRECISION_distribute( &vc2, &vc1, l->next_level ); vector_PRECISION_gather( &vc3, &vc2, l->next_level ); if ( !l->next_level->idle ) { - vector_PRECISION_minus( &vc2, vc1, &vc3, 0, civs, l->next_level ); - diff = global_norm_PRECISION( &vc2, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( vc1, 0, civs, l->next_level, no_threading ); + vector_PRECISION_minus( &vc2, &vc1, &vc3, 0, civs, l->next_level ); + diff = global_norm_PRECISION( &vc2, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( &vc1, 0, civs, l->next_level, no_threading ); } test0_PRECISION("depth: %d, correctness of gather( distribute( phi_c ) ) : %le\n", l->depth, diff ); if ( !l->next_level->idle ) - vector_PRECISION_define_random( vc1, 0, civs, l->next_level ); - interpolate3_PRECISION( vp1, vc1, l, no_threading ); - restrict_PRECISION( &vc2, vp1, l, no_threading ); + vector_PRECISION_define_random( &vc1, 0, civs, l->next_level ); + interpolate3_PRECISION( &vp1, &vc1, l, no_threading ); + restrict_PRECISION( &vc2, &vp1, l, no_threading ); if ( !l->next_level->idle ) { - vector_PRECISION_minus( &vc3, vc1, &vc2, 0, civs, l->next_level ); - diff = global_norm_PRECISION( &vc3, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( vc1, 0, civs, l->next_level, no_threading ); + vector_PRECISION_minus( &vc3, &vc1, &vc2, 0, civs, l->next_level ); + diff = global_norm_PRECISION( &vc3, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( &vc1, 0, civs, l->next_level, no_threading ); test0_PRECISION("depth: %d, correctness of ( P* P - 1 ) phi_c: %le\n", l->depth, abs_PRECISION(diff) ); } END_LOCKED_MASTER(threading) if(threading->n_core>1) { - interpolate3_PRECISION( vp1, vc1, l, threading ); - restrict_PRECISION( &vc2, vp1, l, threading ); + interpolate3_PRECISION( &vp1, &vc1, l, threading ); + restrict_PRECISION( &vc2, &vp1, l, threading ); START_LOCKED_MASTER(threading) if ( !l->next_level->idle ) { - vector_PRECISION_minus( &vc3, vc1, &vc2, 0, civs, l->next_level ); - diff = global_norm_PRECISION( &vc3, 0, civs, l->next_level, 
no_threading ) / global_norm_PRECISION( vc1, 0, civs, l->next_level, no_threading ); + vector_PRECISION_minus( &vc3, &vc1, &vc2, 0, civs, l->next_level ); + diff = global_norm_PRECISION( &vc3, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( &vc1, 0, civs, l->next_level, no_threading ); test0_PRECISION("depth: %d, correctness of ( P* P - 1 ) phi_c with threading: %le\n", l->depth, diff ); } END_LOCKED_MASTER(threading) @@ -797,27 +799,27 @@ void coarse_operator_PRECISION_test_routine( level_struct *l, struct Thread *thr START_LOCKED_MASTER(threading) if (l->depth==0) - gamma5_PRECISION( &vp2, vp1, l, no_threading ); + gamma5_PRECISION( &vp2, &vp1, l, no_threading ); else - coarse_gamma5_PRECISION( &vp2, vp1, 0, ivs, l ); + coarse_gamma5_PRECISION( &vp2, &vp1, 0, ivs, l ); restrict_PRECISION( &vc2, &vp2, l, no_threading ); coarse_gamma5_PRECISION( &vc3, &vc2, 0, civs, l->next_level ); if ( !l->next_level->idle ) { - vector_PRECISION_minus( &vc2, vc1, &vc3, 0, civs, l->next_level ); - diff = global_norm_PRECISION( &vc2, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( vc1, 0, civs, l->next_level, no_threading ); + vector_PRECISION_minus( &vc2, &vc1, &vc3, 0, civs, l->next_level ); + diff = global_norm_PRECISION( &vc2, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( &vc1, 0, civs, l->next_level, no_threading ); test0_PRECISION("depth: %d, correctness of ( g5_c P* g5 P - 1 ) phi_c: %le\n", l->depth, diff ); } #ifdef HAVE_TM1p1 if(g.n_flavours == 2) { if (l->depth==0) - tau1_gamma5_PRECISION( &vp2, vp1, l, no_threading ); + tau1_gamma5_PRECISION( &vp2, &vp1, l, no_threading ); else - coarse_tau1_gamma5_PRECISION( &vp2, vp1, 0, ivs, l ); + coarse_tau1_gamma5_PRECISION( &vp2, &vp1, 0, ivs, l ); restrict_PRECISION( &vc2, &vp2, l, no_threading ); coarse_tau1_gamma5_PRECISION( &vc3, &vc2, 0, civs, l->next_level ); if ( !l->next_level->idle ) { - vector_PRECISION_minus( &vc2, vc1, &vc3, 0, civs, l->next_level ); - diff = global_norm_PRECISION( &vc2, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( vc1, 0, civs, l->next_level, no_threading ); + vector_PRECISION_minus( &vc2, &vc1, &vc3, 0, civs, l->next_level ); + diff = global_norm_PRECISION( &vc2, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( &vc1, 0, civs, l->next_level, no_threading ); test0_PRECISION("depth: %d, correctness of ( tau1 g5_c P* tau1 g5 P - 1 ) phi_c: %le\n", l->depth, diff ); } } @@ -827,14 +829,14 @@ void coarse_operator_PRECISION_test_routine( level_struct *l, struct Thread *thr START_LOCKED_MASTER(threading) vector_PRECISION_define( &vp2, 0, 0, ivs, l ); if (l->depth==0) - add_diagonal_PRECISION( &vp2, vp1, l->s_PRECISION.op.odd_proj, ivs ); + add_diagonal_PRECISION( &vp2, &vp1, l->s_PRECISION.op.odd_proj, ivs ); else - coarse_add_block_diagonal_PRECISION( &vp2, vp1, l->s_PRECISION.op.odd_proj, ivs, l ); + coarse_add_block_diagonal_PRECISION( &vp2, &vp1, l->s_PRECISION.op.odd_proj, ivs, l ); restrict_PRECISION( &vc2, &vp2, l, no_threading ); vector_PRECISION_scale( &vc2, &vc2, -1.0, 0, civs, l->next_level ); - coarse_add_block_diagonal_PRECISION( &vc2, vc1, l->next_level->s_PRECISION.op.odd_proj, civs, l->next_level ); - diff = global_norm_PRECISION( &vc2, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( vc1, 0, civs, l->next_level, no_threading ); + coarse_add_block_diagonal_PRECISION( &vc2, &vc1, l->next_level->s_PRECISION.op.odd_proj, civs, l->next_level ); + diff = global_norm_PRECISION( &vc2, 0, civs, l->next_level, no_threading ) / 
global_norm_PRECISION( &vc1, 0, civs, l->next_level, no_threading ); test0_PRECISION("depth: %d, correctness of ( P* 1odd P - 1odd_c ) phi_c: %le\n", l->depth, diff ); END_LOCKED_MASTER(threading) @@ -843,14 +845,14 @@ void coarse_operator_PRECISION_test_routine( level_struct *l, struct Thread *thr if (g.mu + g.mu_odd_shift != 0.0 || g.mu + g.mu_even_shift != 0.0 ) { vector_PRECISION_define( &vp2, 0, 0, ivs, l ); if (l->depth==0) - add_diagonal_PRECISION( &vp2, vp1, l->s_PRECISION.op.tm_term, ivs ); + add_diagonal_PRECISION( &vp2, &vp1, l->s_PRECISION.op.tm_term, ivs ); else - coarse_add_anti_block_diagonal_PRECISION( &vp2, vp1, l->s_PRECISION.op.tm_term, ivs, l ); + coarse_add_anti_block_diagonal_PRECISION( &vp2, &vp1, l->s_PRECISION.op.tm_term, ivs, l ); restrict_PRECISION( &vc2, &vp2, l, no_threading ); vector_PRECISION_scale( &vc2, &vc2, -g.mu_factor[l->next_level->depth]/g.mu_factor[l->depth], 0, civs, l->next_level ); - coarse_add_anti_block_diagonal_PRECISION( &vc2, vc1, l->next_level->s_PRECISION.op.tm_term, civs, l->next_level ); - diff = global_norm_PRECISION( &vc2, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( vc1, 0, civs, l->next_level, no_threading ); + coarse_add_anti_block_diagonal_PRECISION( &vc2, &vc1, l->next_level->s_PRECISION.op.tm_term, civs, l->next_level ); + diff = global_norm_PRECISION( &vc2, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( &vc1, 0, civs, l->next_level, no_threading ); test0_PRECISION("depth: %d, correctness of ( P* tm P - tm_c ) phi_c: %le\n", l->depth, diff ); } END_LOCKED_MASTER(threading) @@ -862,14 +864,14 @@ void coarse_operator_PRECISION_test_routine( level_struct *l, struct Thread *thr ( g.epsbar != 0 || g.epsbar_ig5_odd_shift != 0 || g.epsbar_ig5_odd_shift != 0 ) ) { vector_PRECISION_define( &vp2, 0, 0, ivs, l ); if (l->depth==0) - apply_doublet_coupling_PRECISION( &vp2, vp1, l->s_PRECISION.op.epsbar_term, ivs ); + apply_doublet_coupling_PRECISION( &vp2, &vp1, l->s_PRECISION.op.epsbar_term, ivs ); else - coarse_add_doublet_coupling_PRECISION( &vp2, vp1, l->s_PRECISION.op.epsbar_term, ivs, l ); + coarse_add_doublet_coupling_PRECISION( &vp2, &vp1, l->s_PRECISION.op.epsbar_term, ivs, l ); restrict_PRECISION( &vc2, &vp2, l, no_threading ); vector_PRECISION_scale( &vc2, &vc2, -g.epsbar_factor[l->next_level->depth]/g.epsbar_factor[l->depth], 0, civs, l->next_level ); - coarse_add_doublet_coupling_PRECISION( &vc2, vc1, l->next_level->s_PRECISION.op.epsbar_term, civs, l->next_level ); - diff = global_norm_PRECISION( &vc2, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( vc1, 0, civs, l->next_level, no_threading ); + coarse_add_doublet_coupling_PRECISION( &vc2, &vc1, l->next_level->s_PRECISION.op.epsbar_term, civs, l->next_level ); + diff = global_norm_PRECISION( &vc2, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( &vc1, 0, civs, l->next_level, no_threading ); test0_PRECISION("depth: %d, correctness of ( P* eps P - eps_c ) phi_c: %le\n", l->depth, diff ); } END_LOCKED_MASTER(threading) @@ -877,14 +879,14 @@ void coarse_operator_PRECISION_test_routine( level_struct *l, struct Thread *thr if ( l->level > 0 ) { START_LOCKED_MASTER(threading) - interpolate3_PRECISION( vp1, vc1, l, no_threading ); + interpolate3_PRECISION( &vp1, &vc1, l, no_threading ); - apply_operator_PRECISION( &vp2, vp1, &(l->p_PRECISION), l, no_threading ); + apply_operator_PRECISION( &vp2, &vp1, &(l->p_PRECISION), l, no_threading ); #ifdef HAVE_TM if (g.mu + g.mu_odd_shift != 0.0 || g.mu + g.mu_even_shift != 0.0 ) 
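Note (illustration only, not part of any patch in this series): every test0_PRECISION check in this test routine follows one pattern: apply a fine-level term to a random coarse vector lifted by P, restrict the result with P*, subtract the matching coarse-level term, and print the norm ratio as a relative error. A minimal self-contained sketch of that pattern in plain C; term_t, restricted_fine and coarse are hypothetical stand-ins, not DDalphaAMG API.

#include <complex.h>
#include <math.h>

typedef void (*term_t)( double complex *out, const double complex *in, int n );

/* Returns ||restricted_fine(phi_c) - coarse(phi_c)|| / ||phi_c||, the number
   printed by the "correctness of ( P* X P - X_c ) phi_c" messages above;
   t1 and t2 are caller-provided work buffers of length n. */
static double consistency_check( term_t restricted_fine, term_t coarse,
                                 const double complex *phi_c,
                                 double complex *t1, double complex *t2, int n ) {
  restricted_fine( t1, phi_c, n );
  coarse( t2, phi_c, n );
  double num = 0.0, den = 0.0;
  for ( int i = 0; i < n; i++ ) {
    double complex d = t1[i] - t2[i];
    num += creal( d*conj(d) );
    den += creal( phi_c[i]*conj(phi_c[i]) );
  }
  return sqrt( num/den );
}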
if (g.mu_factor[l->depth] != g.mu_factor[l->next_level->depth]) { - vector_PRECISION_scale( &vp3, vp1, (g.mu_factor[l->next_level->depth]/g.mu_factor[l->depth])-1., 0, ivs, l ); + vector_PRECISION_scale( &vp3, &vp1, (g.mu_factor[l->next_level->depth]/g.mu_factor[l->depth])-1., 0, ivs, l ); if(l->depth == 0) add_diagonal_PRECISION( &vp2, &vp3, l->p_PRECISION.op->tm_term, ivs ); else @@ -895,9 +897,9 @@ void coarse_operator_PRECISION_test_routine( level_struct *l, struct Thread *thr if ( !l->next_level->idle ) { if ( l->level==1 && g.odd_even ) - coarse_odd_even_PRECISION_test( &vc3, vc1, l->next_level, no_threading ); + coarse_odd_even_PRECISION_test( &vc3, &vc1, l->next_level, no_threading ); else - apply_operator_PRECISION( &vc3, vc1, &(l->next_level->p_PRECISION), l->next_level, no_threading ); + apply_operator_PRECISION( &vc3, &vc1, &(l->next_level->p_PRECISION), l->next_level, no_threading ); vector_PRECISION_minus( &vc3, &vc2, &vc3, 0, civs, l->next_level ); diff = global_norm_PRECISION( &vc3, 0, civs, l->next_level, no_threading ) /global_norm_PRECISION( &vc2, 0, civs, l->next_level, no_threading ); @@ -913,9 +915,9 @@ void coarse_operator_PRECISION_test_routine( level_struct *l, struct Thread *thr if(threading->n_core>1) { if ( !l->next_level->idle ) { if ( l->level==1 && g.odd_even ) - coarse_odd_even_PRECISION_test( &vc3, vc1, l->next_level, threading ); + coarse_odd_even_PRECISION_test( &vc3, &vc1, l->next_level, threading ); else - apply_operator_PRECISION( &vc3, vc1, &(l->next_level->p_PRECISION), l->next_level, threading ); + apply_operator_PRECISION( &vc3, &vc1, &(l->next_level->p_PRECISION), l->next_level, threading ); } START_LOCKED_MASTER(threading) if ( !l->next_level->idle ) { @@ -933,28 +935,28 @@ void coarse_operator_PRECISION_test_routine( level_struct *l, struct Thread *thr START_LOCKED_MASTER(threading) if ( l->level > 0 && l->depth > 0 && g.method == 3 && g.odd_even ) { - vector_PRECISION_define_random( vp1, 0, ivs, l ); - block_to_oddeven_PRECISION( &vp4, vp1, l, no_threading ); + vector_PRECISION_define_random( &vp1, 0, ivs, l ); + block_to_oddeven_PRECISION( &vp4, &vp1, l, no_threading ); coarse_diag_ee_PRECISION( &vp3, &vp4, &(l->oe_op_PRECISION), l, no_threading ); coarse_diag_oo_PRECISION( &vp3, &vp4, &(l->oe_op_PRECISION), l, no_threading ); coarse_hopping_term_PRECISION( &vp3, &vp4, &(l->oe_op_PRECISION), _FULL_SYSTEM, l, no_threading ); oddeven_to_block_PRECISION( &vp4, &vp3, l, no_threading ); - apply_operator_PRECISION( &vp2, vp1, &(l->p_PRECISION), l, no_threading ); + apply_operator_PRECISION( &vp2, &vp1, &(l->p_PRECISION), l, no_threading ); vector_PRECISION_minus( &vp4, &vp4, &vp2, 0, ivs, l ); diff = global_norm_PRECISION( &vp4, 0, ivs, l, no_threading ) / global_norm_PRECISION( &vp2, 0, ivs, l, no_threading ); test0_PRECISION("depth: %d, correctness of odd even layout (smoother): %le\n", l->depth, diff ); - block_to_oddeven_PRECISION( &vp4, vp1, l, no_threading ); + block_to_oddeven_PRECISION( &vp4, &vp1, l, no_threading ); coarse_odd_even_PRECISION_test( &vp3, &vp4, l, no_threading ); oddeven_to_block_PRECISION( &vp4, &vp3, l, no_threading ); - apply_operator_PRECISION( &vp2, vp1, &(l->p_PRECISION), l, no_threading ); + apply_operator_PRECISION( &vp2, &vp1, &(l->p_PRECISION), l, no_threading ); vector_PRECISION_minus( &vp4, &vp4, &vp2, 0, ivs, l ); diff = global_norm_PRECISION( &vp4, 0, ivs, l, no_threading ) / global_norm_PRECISION( &vp2, 0, ivs, l, no_threading ); test0_PRECISION("depth: %d, correctness of odd even preconditioned operator 
(smoother): %le\n", l->depth, diff ); } - FREE( vp1->vector_buffer, complex_PRECISION, 4*vs ); - FREE( vc1->vector_buffer, complex_PRECISION, 3*cvs ); + FREE( vp1.vector_buffer, complex_PRECISION, 4*vs ); + FREE( vc1.vector_buffer, complex_PRECISION, 3*cvs ); END_LOCKED_MASTER(threading) if ( g.method != 6 && l->next_level->level > 0 && !l->next_level->idle ) { diff --git a/src/coarse_operator_generic.h b/src/coarse_operator_generic.h index 59e3b62..25215a0 100644 --- a/src/coarse_operator_generic.h +++ b/src/coarse_operator_generic.h @@ -365,6 +365,7 @@ int num_eig_vect = l->num_parent_eig_vect, num_eig_vect2 = SQUARE(l->num_parent_eig_vect); + buffer_PRECISION phi_pt=phi->vector_buffer, eta_pt=eta->vector_buffer; // U_mu(x) = [ A B , U_-mu(x+muhat) = [ A* -C* // C D ] -B* D* ] // storage order: A, C, B, D @@ -373,51 +374,51 @@ #ifdef HAVE_TM1p1 if( g.n_flavours == 2 ) { // A - nmv_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); - eta->vector_buffer += num_eig_vect;//1 - phi->vector_buffer += num_eig_vect;//1 - nmv_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); + nmv_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); + eta_pt += num_eig_vect;//1 + phi_pt += num_eig_vect;//1 + nmv_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); // C - eta->vector_buffer += num_eig_vect;//2 - phi->vector_buffer -= num_eig_vect;//0 + eta_pt += num_eig_vect;//2 + phi_pt -= num_eig_vect;//0 D += num_eig_vect2; - nmv_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); - eta->vector_buffer += num_eig_vect;//3 - phi->vector_buffer += num_eig_vect;//1 - nmv_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); + nmv_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); + eta_pt += num_eig_vect;//3 + phi_pt += num_eig_vect;//1 + nmv_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); // B - eta->vector_buffer -= 3*num_eig_vect;//0 - phi->vector_buffer += num_eig_vect;//2 + eta_pt -= 3*num_eig_vect;//0 + phi_pt += num_eig_vect;//2 D += num_eig_vect2; - nmv_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); - eta->vector_buffer += num_eig_vect;//1 - phi->vector_buffer += num_eig_vect;//3 - nmv_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); + nmv_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); + eta_pt += num_eig_vect;//1 + phi_pt += num_eig_vect;//3 + nmv_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); // D - eta->vector_buffer += num_eig_vect;//2 - phi->vector_buffer -= num_eig_vect;//2 + eta_pt += num_eig_vect;//2 + phi_pt -= num_eig_vect;//2 D += num_eig_vect2; - nmv_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); - eta->vector_buffer += num_eig_vect;//3 - phi->vector_buffer += num_eig_vect;//3 - nmv_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); + nmv_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); + eta_pt += num_eig_vect;//3 + phi_pt += num_eig_vect;//3 + nmv_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); } else { #endif // A - nmv_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); + nmv_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); // C - eta->vector_buffer += num_eig_vect; + eta_pt += num_eig_vect; D += num_eig_vect2; - nmv_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); + nmv_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); // B - phi->vector_buffer += num_eig_vect; - eta->vector_buffer -= num_eig_vect; + phi_pt += num_eig_vect; + eta_pt -= num_eig_vect; D += num_eig_vect2; - nmv_PRECISION( eta->vector_buffer, D, phi->vector_buffer, 
num_eig_vect ); + nmv_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); // D - eta->vector_buffer += num_eig_vect; + eta_pt += num_eig_vect; D += num_eig_vect2; - nmv_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); + nmv_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); #ifdef HAVE_TM1p1 } #endif @@ -429,6 +430,7 @@ int num_eig_vect = l->num_parent_eig_vect, num_eig_vect2 = SQUARE(l->num_parent_eig_vect); + buffer_PRECISION phi_pt=phi->vector_buffer, eta_pt=eta->vector_buffer; // U_mu(x) = [ A B , U_-mu(x+muhat) = [ A* -C* // C D ] -B* D* ] // storage order: A, C, B, D @@ -437,51 +439,51 @@ #ifdef HAVE_TM1p1 if( g.n_flavours == 2 ) { // A* - nmvh_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); - eta->vector_buffer += num_eig_vect;//1 - phi->vector_buffer += num_eig_vect;//1 - nmvh_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); + nmvh_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); + eta_pt += num_eig_vect;//1 + phi_pt += num_eig_vect;//1 + nmvh_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); // -C* - eta->vector_buffer -= num_eig_vect;//0 - phi->vector_buffer += num_eig_vect;//2 + eta_pt -= num_eig_vect;//0 + phi_pt += num_eig_vect;//2 D += num_eig_vect2; - mvh_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); - eta->vector_buffer += num_eig_vect;//1 - phi->vector_buffer += num_eig_vect;//3 - mvh_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); + mvh_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); + eta_pt += num_eig_vect;//1 + phi_pt += num_eig_vect;//3 + mvh_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); // -B* - eta->vector_buffer += num_eig_vect;//2 - phi->vector_buffer -= 3*num_eig_vect;//0 + eta_pt += num_eig_vect;//2 + phi_pt -= 3*num_eig_vect;//0 D += num_eig_vect2; - mvh_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); - eta->vector_buffer += num_eig_vect;//3 - phi->vector_buffer += num_eig_vect;//1 - mvh_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); + mvh_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); + eta_pt += num_eig_vect;//3 + phi_pt += num_eig_vect;//1 + mvh_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); // D* - eta->vector_buffer -= num_eig_vect;//2 - phi->vector_buffer += num_eig_vect;//2 + eta_pt -= num_eig_vect;//2 + phi_pt += num_eig_vect;//2 D += num_eig_vect2; - nmvh_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); - eta->vector_buffer += num_eig_vect;//3 - phi->vector_buffer += num_eig_vect;//3 - nmvh_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); + nmvh_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); + eta_pt += num_eig_vect;//3 + phi_pt += num_eig_vect;//3 + nmvh_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); } else { #endif // A* - nmvh_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); + nmvh_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); // -C* - phi->vector_buffer += num_eig_vect; + phi_pt += num_eig_vect; D += num_eig_vect2; - mvh_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); + mvh_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); // -B* - eta->vector_buffer += num_eig_vect; - phi->vector_buffer -= num_eig_vect; + eta_pt += num_eig_vect; + phi_pt -= num_eig_vect; D += num_eig_vect2; - mvh_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); + mvh_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); // D* - phi->vector_buffer += num_eig_vect; + phi_pt += num_eig_vect; D += num_eig_vect2; - nmvh_PRECISION( eta->vector_buffer, D, 
phi->vector_buffer, num_eig_vect ); + nmvh_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); #ifdef HAVE_TM1p1 } #endif @@ -492,6 +494,7 @@ int num_eig_vect = l->num_parent_eig_vect, num_eig_vect2 = SQUARE(l->num_parent_eig_vect); + buffer_PRECISION phi_pt=phi->vector_buffer, eta_pt=eta->vector_buffer; // U_mu(x) = [ A B , U_-mu(x+muhat) = [ A* -C* // C D ] -B* D* ] // storage order: A, C, B, D @@ -500,51 +503,51 @@ #ifdef HAVE_TM1p1 if( g.n_flavours == 2 ) { // A - mv_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); - eta->vector_buffer += num_eig_vect;//1 - phi->vector_buffer += num_eig_vect;//1 - mv_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); + mv_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); + eta_pt += num_eig_vect;//1 + phi_pt += num_eig_vect;//1 + mv_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); // C - eta->vector_buffer += num_eig_vect;//2 - phi->vector_buffer -= num_eig_vect;//0 + eta_pt += num_eig_vect;//2 + phi_pt -= num_eig_vect;//0 D += num_eig_vect2; - mv_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); - eta->vector_buffer += num_eig_vect;//3 - phi->vector_buffer += num_eig_vect;//1 - mv_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); + mv_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); + eta_pt += num_eig_vect;//3 + phi_pt += num_eig_vect;//1 + mv_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); // B - eta->vector_buffer -= 3*num_eig_vect;//0 - phi->vector_buffer += num_eig_vect;//2 + eta_pt -= 3*num_eig_vect;//0 + phi_pt += num_eig_vect;//2 D += num_eig_vect2; - mv_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); - eta->vector_buffer += num_eig_vect;//1 - phi->vector_buffer += num_eig_vect;//3 - mv_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); + mv_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); + eta_pt += num_eig_vect;//1 + phi_pt += num_eig_vect;//3 + mv_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); // D - eta->vector_buffer += num_eig_vect;//2 - phi->vector_buffer -= num_eig_vect;//2 + eta_pt += num_eig_vect;//2 + phi_pt -= num_eig_vect;//2 D += num_eig_vect2; - mv_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); - eta->vector_buffer += num_eig_vect;//3 - phi->vector_buffer += num_eig_vect;//3 - mv_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); + mv_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); + eta_pt += num_eig_vect;//3 + phi_pt += num_eig_vect;//3 + mv_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); } else { #endif // A - mv_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); + mv_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); // C - eta->vector_buffer += num_eig_vect; + eta_pt += num_eig_vect; D += num_eig_vect2; - mv_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); + mv_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); // B - phi->vector_buffer += num_eig_vect; - eta->vector_buffer -= num_eig_vect; + phi_pt += num_eig_vect; + eta_pt -= num_eig_vect; D += num_eig_vect2; - mv_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); + mv_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); // D - eta->vector_buffer += num_eig_vect; + eta_pt += num_eig_vect; D += num_eig_vect2; - mv_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); + mv_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); #ifdef HAVE_TM1p1 } #endif @@ -555,6 +558,7 @@ int num_eig_vect = l->num_parent_eig_vect, num_eig_vect2 = SQUARE(l->num_parent_eig_vect); + buffer_PRECISION 
phi_pt=phi->vector_buffer, eta_pt=eta->vector_buffer; // U_mu(x) = [ A B , U_-mu(x+muhat) = [ A* -C* // C D ] -B* D* ] // storage order: A, C, B, D @@ -563,51 +567,51 @@ #ifdef HAVE_TM1p1 if( g.n_flavours == 2 ) { // A* - mvh_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); - eta->vector_buffer += num_eig_vect;//1 - phi->vector_buffer += num_eig_vect;//1 - mvh_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); + mvh_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); + eta_pt += num_eig_vect;//1 + phi_pt += num_eig_vect;//1 + mvh_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); // -C* - eta->vector_buffer -= num_eig_vect;//0 - phi->vector_buffer += num_eig_vect;//2 + eta_pt -= num_eig_vect;//0 + phi_pt += num_eig_vect;//2 D += num_eig_vect2; - nmvh_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); - eta->vector_buffer += num_eig_vect;//1 - phi->vector_buffer += num_eig_vect;//3 - nmvh_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); + nmvh_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); + eta_pt += num_eig_vect;//1 + phi_pt += num_eig_vect;//3 + nmvh_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); // -B* - eta->vector_buffer += num_eig_vect;//2 - phi->vector_buffer -= 3*num_eig_vect;//0 + eta_pt += num_eig_vect;//2 + phi_pt -= 3*num_eig_vect;//0 D += num_eig_vect2; - nmvh_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); - eta->vector_buffer += num_eig_vect;//3 - phi->vector_buffer += num_eig_vect;//1 - nmvh_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); + nmvh_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); + eta_pt += num_eig_vect;//3 + phi_pt += num_eig_vect;//1 + nmvh_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); // D* - eta->vector_buffer -= num_eig_vect;//2 - phi->vector_buffer += num_eig_vect;//2 + eta_pt -= num_eig_vect;//2 + phi_pt += num_eig_vect;//2 D += num_eig_vect2; - mvh_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); - eta->vector_buffer += num_eig_vect;//3 - phi->vector_buffer += num_eig_vect;//3 - mvh_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); + mvh_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); + eta_pt += num_eig_vect;//3 + phi_pt += num_eig_vect;//3 + mvh_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); } else { #endif // A* - mvh_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); + mvh_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); // -C* - phi->vector_buffer += num_eig_vect; + phi_pt += num_eig_vect; D += num_eig_vect2; - nmvh_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); + nmvh_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); // -B* - eta->vector_buffer += num_eig_vect; - phi->vector_buffer -= num_eig_vect; + eta_pt += num_eig_vect; + phi_pt -= num_eig_vect; D += num_eig_vect2; - nmvh_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); + nmvh_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); // D* - phi->vector_buffer += num_eig_vect; + phi_pt += num_eig_vect; D += num_eig_vect2; - mvh_PRECISION( eta->vector_buffer, D, phi->vector_buffer, num_eig_vect ); + mvh_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); #ifdef HAVE_TM1p1 } #endif @@ -618,25 +622,26 @@ int num_eig_vect = l->num_parent_eig_vect, num_eig_vect2 = SQUARE(l->num_parent_eig_vect); + buffer_PRECISION phi_pt=phi->vector_buffer, eta1_pt=eta1->vector_buffer, eta2_pt=eta2->vector_buffer; // U_mu(x) = [ A B , U_-mu(x+muhat) = [ A* -C* // C D ] -B* D* ] // storage order: A, C, B, D // note: minus sign of D = 
self_coupling - hopping_term is added here // A - mv_PRECISION( eta1->vector_buffer, D, phi->vector_buffer, num_eig_vect ); + mv_PRECISION( eta1_pt, D, phi_pt, num_eig_vect ); // C - eta1->vector_buffer += num_eig_vect; + eta1_pt += num_eig_vect; D += num_eig_vect2; - mv_PRECISION( eta1->vector_buffer, D, phi->vector_buffer, num_eig_vect ); + mv_PRECISION( eta1_pt, D, phi_pt, num_eig_vect ); // B - phi->vector_buffer += num_eig_vect; + phi_pt += num_eig_vect; D += num_eig_vect2; - mv_PRECISION( eta2->vector_buffer, D, phi->vector_buffer, num_eig_vect ); + mv_PRECISION( eta2_pt, D, phi_pt, num_eig_vect ); // D - eta2->vector_buffer += num_eig_vect; + eta2_pt += num_eig_vect; D += num_eig_vect2; - mv_PRECISION( eta2->vector_buffer, D, phi->vector_buffer, num_eig_vect ); + mv_PRECISION( eta2_pt, D, phi_pt, num_eig_vect ); } @@ -644,28 +649,29 @@ vector_PRECISION *phi, config_PRECISION D, level_struct *l ) { int num_eig_vect = l->num_parent_eig_vect, - num_eig_vect2 = SQUARE(l->num_parent_eig_vect); + num_eig_vect2 = SQUARE(l->num_parent_eig_vect); + buffer_PRECISION phi_pt=phi->vector_buffer, eta1_pt=eta1->vector_buffer, eta2_pt=eta2->vector_buffer; // U_mu(x) = [ A B , U_-mu(x+muhat) = [ A* -C* // C D ] -B* D* ] // storage order: A, C, B, D // note: minus sign of D = self_coupling - hopping_term is added here // A* - mvh_PRECISION( eta1->vector_buffer, D, phi->vector_buffer, num_eig_vect ); + mvh_PRECISION( eta1_pt, D, phi_pt, num_eig_vect ); // -C* - phi->vector_buffer += num_eig_vect; + phi_pt += num_eig_vect; D += num_eig_vect2; - nmvh_PRECISION( eta2->vector_buffer, D, phi->vector_buffer, num_eig_vect ); + nmvh_PRECISION( eta2_pt, D, phi_pt, num_eig_vect ); // -B* - eta1->vector_buffer += num_eig_vect; - phi->vector_buffer -= num_eig_vect; + eta1_pt += num_eig_vect; + phi_pt -= num_eig_vect; D += num_eig_vect2; - nmvh_PRECISION( eta1->vector_buffer, D, phi->vector_buffer, num_eig_vect ); + nmvh_PRECISION( eta1_pt, D, phi_pt, num_eig_vect ); // D* - eta2->vector_buffer += num_eig_vect; - phi->vector_buffer += num_eig_vect; + eta2_pt += num_eig_vect; + phi_pt += num_eig_vect; D += num_eig_vect2; - mvh_PRECISION( eta2->vector_buffer, D, phi->vector_buffer, num_eig_vect ); + mvh_PRECISION( eta2_pt, D, phi_pt, num_eig_vect ); } static inline void coarse_spinwise_n_hopp_PRECISION( vector_PRECISION *eta1, vector_PRECISION *eta2, @@ -673,25 +679,26 @@ int num_eig_vect = l->num_parent_eig_vect, num_eig_vect2 = SQUARE(l->num_parent_eig_vect); + buffer_PRECISION phi_pt=phi->vector_buffer, eta1_pt=eta1->vector_buffer, eta2_pt=eta2->vector_buffer; // U_mu(x) = [ A B , U_-mu(x+muhat) = [ A* -C* // C D ] -B* D* ] // storage order: A, C, B, D // note: minus sign of D = self_coupling - hopping_term is added here // A - nmv_PRECISION( eta1->vector_buffer, D, phi->vector_buffer, num_eig_vect ); + nmv_PRECISION( eta1_pt, D, phi_pt, num_eig_vect ); // C - eta1->vector_buffer += num_eig_vect; + eta1_pt += num_eig_vect; D += num_eig_vect2; - nmv_PRECISION( eta1->vector_buffer, D, phi->vector_buffer, num_eig_vect ); + nmv_PRECISION( eta1_pt, D, phi_pt, num_eig_vect ); // B - phi->vector_buffer += num_eig_vect; + phi_pt += num_eig_vect; D += num_eig_vect2; - nmv_PRECISION( eta2->vector_buffer, D, phi->vector_buffer, num_eig_vect ); + nmv_PRECISION( eta2_pt, D, phi_pt, num_eig_vect ); // D - eta2->vector_buffer += num_eig_vect; + eta2_pt += num_eig_vect; D += num_eig_vect2; - nmv_PRECISION( eta2->vector_buffer, D, phi->vector_buffer, num_eig_vect ); + nmv_PRECISION( eta2_pt, D, phi_pt, num_eig_vect ); } @@ 
-699,28 +706,29 @@ vector_PRECISION *phi, config_PRECISION D, level_struct *l ) { int num_eig_vect = l->num_parent_eig_vect, - num_eig_vect2 = SQUARE(l->num_parent_eig_vect); + num_eig_vect2 = SQUARE(l->num_parent_eig_vect); + buffer_PRECISION phi_pt=phi->vector_buffer, eta1_pt=eta1->vector_buffer, eta2_pt=eta2->vector_buffer; // U_mu(x) = [ A B , U_-mu(x+muhat) = [ A* -C* // C D ] -B* D* ] // storage order: A, C, B, D // note: minus sign of D = self_coupling - hopping_term is added here // A* - nmvh_PRECISION( eta1->vector_buffer, D, phi->vector_buffer, num_eig_vect ); + nmvh_PRECISION( eta1_pt, D, phi_pt, num_eig_vect ); // -C* - phi->vector_buffer += num_eig_vect; + phi_pt += num_eig_vect; D += num_eig_vect2; - mvh_PRECISION( eta2->vector_buffer, D, phi->vector_buffer, num_eig_vect ); + mvh_PRECISION( eta2_pt, D, phi_pt, num_eig_vect ); // -B* - eta1->vector_buffer += num_eig_vect; - phi->vector_buffer -= num_eig_vect; + eta1_pt += num_eig_vect; + phi_pt -= num_eig_vect; D += num_eig_vect2; - mvh_PRECISION( eta1->vector_buffer, D, phi->vector_buffer, num_eig_vect ); + mvh_PRECISION( eta1_pt, D, phi_pt, num_eig_vect ); // D* - eta2->vector_buffer += num_eig_vect; - phi->vector_buffer += num_eig_vect; + eta2_pt += num_eig_vect; + phi_pt += num_eig_vect; D += num_eig_vect2; - nmvh_PRECISION( eta2->vector_buffer, D, phi->vector_buffer, num_eig_vect ); + nmvh_PRECISION( eta2_pt, D, phi_pt, num_eig_vect ); } #endif diff --git a/src/dirac_generic.c b/src/dirac_generic.c index eb9a8cc..49d0705 100644 --- a/src/dirac_generic.c +++ b/src/dirac_generic.c @@ -168,17 +168,17 @@ void clover_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, operator_PR static void spin0and1_clover_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, config_PRECISION clover, level_struct *l ) { - buffer_PRECISION eta_end = eta->vector_buffer + l->inner_vector_size; + buffer_PRECISION eta_end = eta->vector_buffer + l->inner_vector_size, leta = eta->vector_buffer, lphi = phi->vector_buffer; if ( g.csw == 0.0 ) { - while ( eta->vector_buffer < eta_end ) { - FOR6( *eta->vector_buffer = (*phi->vector_buffer)*(*clover); eta->vector_buffer++; phi->vector_buffer++; clover++; ) - FOR6( *eta->vector_buffer = _COMPLEX_PRECISION_ZERO; eta->vector_buffer++; ) - phi->vector_buffer+=6; clover+=6; + while ( leta < eta_end ) { + FOR6( *leta = (*lphi)*(*clover); leta++; lphi++; clover++; ) + FOR6( *leta = _COMPLEX_PRECISION_ZERO; leta++; ) + lphi+=6; clover+=6; } } else { - while ( eta->vector_buffer < eta_end ) { - spin0and1_site_clover_PRECISION( eta->vector_buffer, phi->vector_buffer, clover ); - eta->vector_buffer+=12; phi->vector_buffer+=12; clover+=42; + while ( leta < eta_end ) { + spin0and1_site_clover_PRECISION( leta, lphi, clover ); + leta+=12; lphi+=12; clover+=42; } } } @@ -186,17 +186,17 @@ static void spin0and1_clover_PRECISION( vector_PRECISION *eta, vector_PRECISION static void spin2and3_clover_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, config_PRECISION clover, level_struct *l ) { - buffer_PRECISION eta_end = eta->vector_buffer + l->inner_vector_size; + buffer_PRECISION eta_end = eta->vector_buffer + l->inner_vector_size, leta = eta->vector_buffer, lphi = phi->vector_buffer; if ( g.csw == 0.0 ) { - while ( eta->vector_buffer < eta_end ) { - phi->vector_buffer+=6; clover+=6; - FOR6( *eta->vector_buffer = _COMPLEX_PRECISION_ZERO; eta->vector_buffer++; ) - FOR6( *eta->vector_buffer = (*phi->vector_buffer)*(*clover); eta->vector_buffer++; phi->vector_buffer++; clover++; ) + while ( leta < eta_end ) { 
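Note (illustration only, not part of any patch): the two clover kernels being rewritten here act on 12 complex entries per site (4 spins x 3 colors); with g.csw == 0 the clover term is diagonal, so one spinor half is multiplied entry-wise and the other half is zeroed, spin2and3 being the mirror image of spin0and1. A self-contained toy version of the per-site g.csw == 0 step, on plain arrays instead of buffer_PRECISION:

#include <complex.h>

/* Toy site kernel: spins 0 and 1 (entries 0..5) get the diagonal clover
   entries, spins 2 and 3 (entries 6..11) are zeroed; the spin2and3
   counterpart simply swaps the two halves. */
static void spin0and1_diag_site( double complex *eta, const double complex *phi,
                                 const double complex *clover ) {
  for ( int j = 0; j < 6; j++ )  eta[j] = phi[j]*clover[j];
  for ( int j = 6; j < 12; j++ ) eta[j] = 0;
}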
+ lphi+=6; clover+=6; + FOR6( *leta = _COMPLEX_PRECISION_ZERO; leta++; ) + FOR6( *leta = (*lphi)*(*clover); leta++; lphi++; clover++; ) } } else { - while ( eta->vector_buffer < eta_end ) { - spin2and3_site_clover_PRECISION( eta->vector_buffer, phi->vector_buffer, clover ); - eta->vector_buffer +=12; phi->vector_buffer+=12; clover+=42; + while ( leta < eta_end ) { + spin2and3_site_clover_PRECISION( leta, lphi, clover ); + leta +=12; lphi+=12; clover+=42; } } } @@ -623,19 +623,18 @@ void gamma5_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, level_struc ASSERT(l->depth == 0); buffer_PRECISION eta_end = eta->vector_buffer + threading->end_index[l->depth]; - eta->vector_buffer += threading->start_index[l->depth]; - phi->vector_buffer += threading->start_index[l->depth]; + buffer_PRECISION leta = eta->vector_buffer + threading->start_index[l->depth], lphi = phi->vector_buffer + threading->start_index[l->depth]; #ifdef HAVE_TM1p1 if ( g.n_flavours == 2 ) { - while ( eta->vector_buffer < eta_end ) { - FOR12( *eta->vector_buffer = -(*phi->vector_buffer); phi->vector_buffer++; eta->vector_buffer++; ) - FOR12( *eta->vector_buffer = (*phi->vector_buffer); phi->vector_buffer++; eta->vector_buffer++; ) + while ( leta < eta_end ) { + FOR12( *leta = -(*lphi); lphi++; leta++; ) + FOR12( *leta = (*lphi); lphi++; leta++; ) } } else #endif - while ( eta->vector_buffer < eta_end ) { - FOR6( *eta->vector_buffer = -(*phi->vector_buffer); phi->vector_buffer++; eta->vector_buffer++; ) - FOR6( *eta->vector_buffer = (*phi->vector_buffer); phi->vector_buffer++; eta->vector_buffer++; ) + while ( leta < eta_end ) { + FOR6( *leta = -(*lphi); lphi++; leta++; ) + FOR6( *leta = (*lphi); lphi++; leta++; ) } } @@ -646,20 +645,19 @@ void tau1_gamma5_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, level_ #ifdef HAVE_TM1p1 if ( g.n_flavours == 2 ) { buffer_PRECISION eta_end = eta->vector_buffer + threading->end_index[l->depth]; + buffer_PRECISION leta = eta->vector_buffer + threading->start_index[l->depth], lphi = phi->vector_buffer + threading->start_index[l->depth]; complex_PRECISION b[6]; - eta->vector_buffer += threading->start_index[l->depth]; - phi->vector_buffer += threading->start_index[l->depth]; - while ( eta->vector_buffer < eta_end ) { + while ( leta < eta_end ) { int i = 0; - FOR6( b[i] = (*phi->vector_buffer); phi->vector_buffer++; i++; ); - FOR6( *eta->vector_buffer = -(*phi->vector_buffer); phi->vector_buffer++; eta->vector_buffer++; ); + FOR6( b[i] = (*lphi); lphi++; i++; ); + FOR6( *leta = -(*lphi); lphi++; leta++; ); i = 0; - FOR6( *eta->vector_buffer = - b[i] ; eta->vector_buffer++; i++; ); + FOR6( *leta = - b[i] ; leta++; i++; ); i = 0; - FOR6( b[i] = (*phi->vector_buffer); phi->vector_buffer++; i++; ); - FOR6( *eta->vector_buffer = (*phi->vector_buffer); phi->vector_buffer++; eta->vector_buffer++; ); + FOR6( b[i] = (*lphi); lphi++; i++; ); + FOR6( *leta = (*lphi); lphi++; leta++; ); i = 0; - FOR6( *eta->vector_buffer = b[i] ; eta->vector_buffer++; i++; ); + FOR6( *leta = b[i] ; leta++; i++; ); } } else #endif @@ -677,28 +675,27 @@ void set_even_to_zero_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, l int i = threading->start_site[l->depth]; buffer_PRECISION eta_end = eta->vector_buffer + threading->end_index[l->depth]; - eta->vector_buffer += threading->start_index[l->depth]; - phi->vector_buffer += threading->start_index[l->depth]; + buffer_PRECISION leta = eta->vector_buffer + threading->start_index[l->depth], lphi = phi->vector_buffer + threading->start_index[l->depth]; 
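Note (illustration only, not part of any patch): the recurring change in these dirac_generic.c hunks replaces in-place advancement of eta->vector_buffer and phi->vector_buffer with local cursors leta and lphi, offset by the calling thread's start_index. Each thread then walks only its own slice, and the shared vector_PRECISION structs are never mutated. A self-contained sketch of the idiom; vec_t and buffer_t are hypothetical stand-ins for vector_PRECISION and buffer_PRECISION.

#include <complex.h>

typedef double complex *buffer_t;
typedef struct { buffer_t vector_buffer; } vec_t;

/* Copy a thread's slice [start,end) through local cursors; the caller's
   structs keep their original buffer pointers throughout. */
static void copy_slice( vec_t *eta, const vec_t *phi, int start, int end ) {
  buffer_t leta = eta->vector_buffer + start;
  buffer_t lphi = phi->vector_buffer + start;
  buffer_t leta_end = eta->vector_buffer + end;
  while ( leta < leta_end )
    *leta++ = *lphi++;
}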
#ifdef HAVE_TM1p1 if ( g.n_flavours == 2 ) - while ( eta->vector_buffer < eta_end ) { + while ( leta < eta_end ) { if(g.odd_even_table[i]==_ODD) { - FOR24( *eta->vector_buffer = (*phi->vector_buffer); phi->vector_buffer++; eta->vector_buffer++; ); + FOR24( *leta = (*lphi); lphi++; leta++; ); } else if(g.odd_even_table[i]==_EVEN) { - FOR24( *eta->vector_buffer = _COMPLEX_PRECISION_ZERO; phi->vector_buffer++; eta->vector_buffer++; ); + FOR24( *leta = _COMPLEX_PRECISION_ZERO; lphi++; leta++; ); } i++; } else #endif - while ( eta->vector_buffer < eta_end ) { + while ( leta < eta_end ) { if(g.odd_even_table[i]==_ODD) { - FOR12( *eta->vector_buffer = (*phi->vector_buffer); phi->vector_buffer++; eta->vector_buffer++; ); + FOR12( *leta = (*lphi); lphi++; leta++; ); } else if(g.odd_even_table[i]==_EVEN) { - FOR12( *eta->vector_buffer = _COMPLEX_PRECISION_ZERO; phi->vector_buffer++; eta->vector_buffer++; ); + FOR12( *leta = _COMPLEX_PRECISION_ZERO; lphi++; leta++; ); } i++; } @@ -710,30 +707,29 @@ void gamma5_set_even_to_zero_PRECISION( vector_PRECISION *eta, vector_PRECISION int i = threading->start_site[l->depth]; buffer_PRECISION eta_end = eta->vector_buffer + threading->end_index[l->depth]; - eta->vector_buffer += threading->start_index[l->depth]; - phi->vector_buffer += threading->start_index[l->depth]; + buffer_PRECISION leta = eta->vector_buffer + threading->start_index[l->depth], lphi = phi->vector_buffer + threading->start_index[l->depth]; #ifdef HAVE_TM1p1 if ( g.n_flavours == 2 ) - while ( eta->vector_buffer < eta_end ) { + while ( leta < eta_end ) { if(g.odd_even_table[i]==_ODD){ - FOR12( *eta->vector_buffer = -(*phi->vector_buffer); phi->vector_buffer++; eta->vector_buffer++; ); - FOR12( *eta->vector_buffer = (*phi->vector_buffer); phi->vector_buffer++; eta->vector_buffer++; ); + FOR12( *leta = -(*lphi); lphi++; leta++; ); + FOR12( *leta = (*lphi); lphi++; leta++; ); } else if(g.odd_even_table[i]==_EVEN){ - FOR24( *eta->vector_buffer = _COMPLEX_PRECISION_ZERO; phi->vector_buffer++; eta->vector_buffer++; ); + FOR24( *leta = _COMPLEX_PRECISION_ZERO; lphi++; leta++; ); } i++; } else #endif - while ( eta->vector_buffer < eta_end ) { + while ( leta < eta_end ) { if(g.odd_even_table[i]==_ODD){ - FOR6( *eta->vector_buffer = -(*phi->vector_buffer); phi->vector_buffer++; eta->vector_buffer++; ); - FOR6( *eta->vector_buffer = (*phi->vector_buffer); phi->vector_buffer++; eta->vector_buffer++; ); + FOR6( *leta = -(*lphi); lphi++; leta++; ); + FOR6( *leta = (*lphi); lphi++; leta++; ); } else if(g.odd_even_table[i]==_EVEN){ - FOR12( *eta->vector_buffer = _COMPLEX_PRECISION_ZERO; phi->vector_buffer++; eta->vector_buffer++; ); + FOR12( *leta = _COMPLEX_PRECISION_ZERO; lphi++; leta++; ); } i++; } @@ -747,24 +743,23 @@ void tau1_gamma5_set_even_to_zero_PRECISION( vector_PRECISION *eta, vector_PRECI if ( g.n_flavours == 2 ) { int i = threading->start_site[l->depth]; buffer_PRECISION eta_end = eta->vector_buffer + threading->end_index[l->depth]; - eta->vector_buffer += threading->start_index[l->depth]; - phi->vector_buffer += threading->start_index[l->depth]; + buffer_PRECISION leta = eta->vector_buffer + threading->start_index[l->depth], lphi = phi->vector_buffer + threading->start_index[l->depth]; complex_PRECISION b[6]; - while ( eta->vector_buffer < eta_end ) { + while ( leta < eta_end ) { if(g.odd_even_table[i]==_ODD){ int i = 0; - FOR6( b[i] = (*phi->vector_buffer); phi->vector_buffer++; i++; ); - FOR6( *eta->vector_buffer = -(*phi->vector_buffer); phi->vector_buffer++; eta->vector_buffer++; ); + 
FOR6( b[i] = (*lphi); lphi++; i++; ); + FOR6( *leta = -(*lphi); lphi++; leta++; ); i = 0; - FOR6( *eta->vector_buffer = - b[i] ; eta->vector_buffer++; i++; ); + FOR6( *leta = - b[i] ; leta++; i++; ); i = 0; - FOR6( b[i] = (*phi->vector_buffer); phi->vector_buffer++; i++; ); - FOR6( *eta->vector_buffer = (*phi->vector_buffer); phi->vector_buffer++; eta->vector_buffer++; ); + FOR6( b[i] = (*lphi); lphi++; i++; ); + FOR6( *leta = (*lphi); lphi++; leta++; ); i = 0; - FOR6( *eta->vector_buffer = b[i] ; eta->vector_buffer++; i++; ); + FOR6( *leta = b[i] ; leta++; i++; ); } else if(g.odd_even_table[i]==_EVEN){ - FOR24( *eta->vector_buffer = _COMPLEX_PRECISION_ZERO; phi->vector_buffer++; eta->vector_buffer++; ); + FOR24( *leta = _COMPLEX_PRECISION_ZERO; lphi++; leta++; ); } i++; } @@ -782,28 +777,27 @@ void set_odd_to_zero_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, le int i = threading->start_site[l->depth]; buffer_PRECISION eta_end = eta->vector_buffer + threading->end_index[l->depth]; - eta->vector_buffer += threading->start_index[l->depth]; - phi->vector_buffer += threading->start_index[l->depth]; + buffer_PRECISION leta = eta->vector_buffer + threading->start_index[l->depth], lphi = phi->vector_buffer + threading->start_index[l->depth]; #ifdef HAVE_TM1p1 if ( g.n_flavours == 2 ) - while ( eta->vector_buffer < eta_end ) { + while ( leta < eta_end ) { if(g.odd_even_table[i]==_EVEN){ - FOR24( *eta->vector_buffer = (*phi->vector_buffer); phi->vector_buffer++; eta->vector_buffer++; ); + FOR24( *leta = (*lphi); lphi++; leta++; ); } else if(g.odd_even_table[i]==_ODD){ - FOR24( *eta->vector_buffer = 0; phi->vector_buffer++; eta->vector_buffer++; ); + FOR24( *leta = 0; lphi++; leta++; ); } i++; } else #endif - while ( eta->vector_buffer < eta_end ) { + while ( leta < eta_end ) { if(g.odd_even_table[i]==_EVEN) { - FOR12( *eta->vector_buffer = (*phi->vector_buffer); phi->vector_buffer++; eta->vector_buffer++; ); + FOR12( *leta = (*lphi); lphi++; leta++; ); } else if(g.odd_even_table[i]==_ODD) { - FOR12( *eta->vector_buffer = 0; phi->vector_buffer++; eta->vector_buffer++; ); + FOR12( *leta = 0; lphi++; leta++; ); } i++; } @@ -813,30 +807,29 @@ void gamma5_set_odd_to_zero_PRECISION( vector_PRECISION *eta, vector_PRECISION * int i = threading->start_site[l->depth]; buffer_PRECISION eta_end = eta->vector_buffer + threading->end_index[l->depth]; - eta->vector_buffer += threading->start_index[l->depth]; - phi->vector_buffer += threading->start_index[l->depth]; + buffer_PRECISION leta = eta->vector_buffer + threading->start_index[l->depth], lphi = phi->vector_buffer + threading->start_index[l->depth]; #ifdef HAVE_TM1p1 if ( g.n_flavours == 2 ) - while ( eta->vector_buffer < eta_end ) { + while ( leta < eta_end ) { if(g.odd_even_table[i]==_EVEN){ - FOR12( *eta->vector_buffer = -(*phi->vector_buffer); phi->vector_buffer++; eta->vector_buffer++; ); - FOR12( *eta->vector_buffer = (*phi->vector_buffer); phi->vector_buffer++; eta->vector_buffer++; ); + FOR12( *leta = -(*lphi); lphi++; leta++; ); + FOR12( *leta = (*lphi); lphi++; leta++; ); } else if(g.odd_even_table[i]==_ODD){ - FOR24( *eta->vector_buffer = 0; phi->vector_buffer++; eta->vector_buffer++; ); + FOR24( *leta = 0; lphi++; leta++; ); } i++; } else #endif - while ( eta->vector_buffer < eta_end ) { + while ( leta < eta_end ) { if(g.odd_even_table[i]==_EVEN){ - FOR6( *eta->vector_buffer = -(*phi->vector_buffer); phi->vector_buffer++; eta->vector_buffer++; ); - FOR6( *eta->vector_buffer = (*phi->vector_buffer); phi->vector_buffer++; 
eta->vector_buffer++; ); + FOR6( *leta = -(*lphi); lphi++; leta++; ); + FOR6( *leta = (*lphi); lphi++; leta++; ); } else if(g.odd_even_table[i]==_ODD){ - FOR12( *eta->vector_buffer = 0; phi->vector_buffer++; eta->vector_buffer++; ); + FOR12( *leta = 0; lphi++; leta++; ); } i++; } @@ -850,24 +843,23 @@ void tau1_gamma5_set_odd_to_zero_PRECISION( vector_PRECISION *eta, vector_PRECIS if ( g.n_flavours == 2 ) { int i = threading->start_site[l->depth]; buffer_PRECISION eta_end = eta->vector_buffer + threading->end_index[l->depth]; - eta->vector_buffer += threading->start_index[l->depth]; - phi->vector_buffer += threading->start_index[l->depth]; + buffer_PRECISION leta = eta->vector_buffer + threading->start_index[l->depth], lphi = phi->vector_buffer + threading->start_index[l->depth]; complex_PRECISION b[6]; - while ( eta->vector_buffer < eta_end ) { + while ( leta < eta_end ) { if(g.odd_even_table[i]==_EVEN){ int i = 0; - FOR6( b[i] = (*phi->vector_buffer); phi->vector_buffer++; i++; ); - FOR6( *eta->vector_buffer = -(*phi->vector_buffer); phi->vector_buffer++; eta->vector_buffer++; ); + FOR6( b[i] = (*lphi); lphi++; i++; ); + FOR6( *leta = -(*lphi); lphi++; leta++; ); i = 0; - FOR6( *eta->vector_buffer = - b[i] ; eta->vector_buffer++; i++; ); + FOR6( *leta = - b[i] ; leta++; i++; ); i = 0; - FOR6( b[i] = (*phi->vector_buffer); phi->vector_buffer++; i++; ); - FOR6( *eta->vector_buffer = (*phi->vector_buffer); phi->vector_buffer++; eta->vector_buffer++; ); + FOR6( b[i] = (*lphi); lphi++; i++; ); + FOR6( *leta = (*lphi); lphi++; leta++; ); i = 0; - FOR6( *eta->vector_buffer = b[i] ; eta->vector_buffer++; i++; ); + FOR6( *leta = b[i] ; leta++; i++; ); } else if(g.odd_even_table[i]==_ODD){ - FOR24( *eta->vector_buffer = _COMPLEX_PRECISION_ZERO; phi->vector_buffer++; eta->vector_buffer++; ); + FOR24( *leta = _COMPLEX_PRECISION_ZERO; lphi++; leta++; ); } i++; } @@ -886,28 +878,27 @@ void scale_even_odd_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, com int i = threading->start_site[l->depth]; buffer_PRECISION eta_end = eta->vector_buffer + threading->end_index[l->depth]; - eta->vector_buffer += threading->start_index[l->depth]; - phi->vector_buffer += threading->start_index[l->depth]; + buffer_PRECISION leta = eta->vector_buffer + threading->start_index[l->depth], lphi = phi->vector_buffer + threading->start_index[l->depth]; #ifdef HAVE_TM1p1 if ( g.n_flavours == 2 ) - while ( eta->vector_buffer < eta_end ) { + while ( leta < eta_end ) { if(g.odd_even_table[i]==_EVEN){ - FOR24( *eta->vector_buffer = even*(*phi->vector_buffer); phi->vector_buffer++; eta->vector_buffer++; ); + FOR24( *leta = even*(*lphi); lphi++; leta++; ); } else if(g.odd_even_table[i]==_ODD){ - FOR24( *eta->vector_buffer = odd*(*phi->vector_buffer); phi->vector_buffer++; eta->vector_buffer++; ); + FOR24( *leta = odd*(*lphi); lphi++; leta++; ); } i++; } else #endif - while ( eta->vector_buffer < eta_end ) { + while ( leta < eta_end ) { if(g.odd_even_table[i]==_EVEN) { - FOR12( *eta->vector_buffer = even*(*phi->vector_buffer); phi->vector_buffer++; eta->vector_buffer++; ); + FOR12( *leta = even*(*lphi); lphi++; leta++; ); } else if(g.odd_even_table[i]==_ODD) { - FOR12( *eta->vector_buffer = odd*(*phi->vector_buffer); phi->vector_buffer++; eta->vector_buffer++; ); + FOR12( *leta = odd*(*lphi); lphi++; leta++; ); } i++; } @@ -924,26 +915,27 @@ void two_flavours_to_serial_PRECISION( vector_PRECISION *flav1, vector_PRECISION * spin2and3 of flav1 * spin2and3 of flav2 */ - vector_PRECISION serial_end; + buffer_PRECISION 
serial_end; + buffer_PRECISION serial_pt = serial->vector_buffer, flav1_pt = flav1->vector_buffer, flav2_pt = flav2->vector_buffer; if( g.n_flavours == 2 ) { - serial_end.vector_buffer = serial->vector_buffer + threading->end_index[l->depth]; - serial->vector_buffer += threading->start_index[l->depth]; - flav1->vector_buffer += threading->start_index[l->depth]/2; - flav2->vector_buffer += threading->start_index[l->depth]/2; + serial_end = serial->vector_buffer + threading->end_index[l->depth]; + serial_pt += threading->start_index[l->depth]; + flav1_pt += threading->start_index[l->depth]/2; + flav2_pt += threading->start_index[l->depth]/2; } else { - serial_end.vector_buffer = serial->vector_buffer + threading->end_index[l->depth]*2; - serial->vector_buffer += threading->start_index[l->depth]*2; - flav1->vector_buffer += threading->start_index[l->depth]; - flav2->vector_buffer += threading->start_index[l->depth]; + serial_end = serial->vector_buffer + threading->end_index[l->depth]*2; + serial_pt += threading->start_index[l->depth]*2; + flav1_pt += threading->start_index[l->depth]; + flav2_pt += threading->start_index[l->depth]; } - while ( serial->vector_buffer < serial_end.vector_buffer ) { - FOR6( *serial->vector_buffer = (*flav1->vector_buffer); serial->vector_buffer++; flav1->vector_buffer++; ) - FOR6( *serial->vector_buffer = (*flav2->vector_buffer); serial->vector_buffer++; flav2->vector_buffer++; ) - FOR6( *serial->vector_buffer = (*flav1->vector_buffer); serial->vector_buffer++; flav1->vector_buffer++; ) - FOR6( *serial->vector_buffer = (*flav2->vector_buffer); serial->vector_buffer++; flav2->vector_buffer++; ) + while ( serial_pt < serial_end ) { + FOR6( *serial_pt = (*flav1_pt); serial_pt++; flav1_pt++; ) + FOR6( *serial_pt = (*flav2_pt); serial_pt++; flav2_pt++; ) + FOR6( *serial_pt = (*flav1_pt); serial_pt++; flav1_pt++; ) + FOR6( *serial_pt = (*flav2_pt); serial_pt++; flav2_pt++; ) } #else START_MASTER(threading) @@ -956,26 +948,27 @@ void two_flavours_to_serial_PRECISION( vector_PRECISION *flav1, vector_PRECISION void serial_to_two_flavours_PRECISION( vector_PRECISION *flav1, vector_PRECISION *flav2, vector_PRECISION *serial, level_struct *l, struct Thread *threading ) { #ifdef HAVE_TM1p1 - vector_PRECISION serial_end; - + buffer_PRECISION serial_end; + buffer_PRECISION serial_pt = serial->vector_buffer, flav1_pt = flav1->vector_buffer, flav2_pt = flav2->vector_buffer; + if( g.n_flavours == 2 ) { - serial_end.vector_buffer = serial->vector_buffer + threading->end_index[l->depth]; - serial->vector_buffer += threading->start_index[l->depth]; - flav1->vector_buffer += threading->start_index[l->depth]/2; - flav2->vector_buffer += threading->start_index[l->depth]/2; + serial_end = serial->vector_buffer + threading->end_index[l->depth]; + serial_pt += threading->start_index[l->depth]; + flav1_pt += threading->start_index[l->depth]/2; + flav2_pt += threading->start_index[l->depth]/2; } else { - serial_end.vector_buffer = serial->vector_buffer + threading->end_index[l->depth]*2; - serial->vector_buffer += threading->start_index[l->depth]*2; - flav1->vector_buffer += threading->start_index[l->depth]; - flav2->vector_buffer += threading->start_index[l->depth]; + serial_end = serial->vector_buffer + threading->end_index[l->depth]*2; + serial_pt += threading->start_index[l->depth]*2; + flav1_pt += threading->start_index[l->depth]; + flav2_pt += threading->start_index[l->depth]; } - while ( serial->vector_buffer < serial_end.vector_buffer ) { - FOR6( *flav1->vector_buffer = 
(*serial->vector_buffer); serial->vector_buffer++; flav1->vector_buffer++; ) - FOR6( *flav2->vector_buffer = (*serial->vector_buffer); serial->vector_buffer++; flav2->vector_buffer++; ) - FOR6( *flav1->vector_buffer = (*serial->vector_buffer); serial->vector_buffer++; flav1->vector_buffer++; ) - FOR6( *flav2->vector_buffer = (*serial->vector_buffer); serial->vector_buffer++; flav2->vector_buffer++; ) + while ( serial_pt < serial_end ) { + FOR6( *flav1_pt = (*serial_pt); serial_pt++; flav1_pt++; ) + FOR6( *flav2_pt = (*serial_pt); serial_pt++; flav2_pt++; ) + FOR6( *flav1_pt = (*serial_pt); serial_pt++; flav1_pt++; ) + FOR6( *flav2_pt = (*serial_pt); serial_pt++; flav2_pt++; ) } #else START_MASTER(threading) @@ -995,10 +988,10 @@ void g5D_plus_clover_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, op void diagonal_aggregate_PRECISION( vector_PRECISION *eta1, vector_PRECISION *eta2, vector_PRECISION *phi, config_PRECISION diag, level_struct *l ) { buffer_PRECISION eta_end = eta1->vector_buffer + l->inner_vector_size; - - while ( eta1->vector_buffer < eta_end ) { - FOR6( *eta1->vector_buffer = (*phi->vector_buffer)*(*diag); *eta2->vector_buffer = _COMPLEX_PRECISION_ZERO; eta1->vector_buffer++; eta2->vector_buffer++; phi->vector_buffer++; diag++; ); - FOR6( *eta2->vector_buffer = (*phi->vector_buffer)*(*diag); *eta1->vector_buffer = _COMPLEX_PRECISION_ZERO; eta1->vector_buffer++; eta2->vector_buffer++; phi->vector_buffer++; diag++; ); + buffer_PRECISION eta1_pt = eta1->vector_buffer, eta2_pt = eta2->vector_buffer, phi_pt = phi->vector_buffer; + while ( eta1_pt < eta_end ) { + FOR6( *eta1_pt = (*phi_pt)*(*diag); *eta2_pt = _COMPLEX_PRECISION_ZERO; eta1_pt++; eta2_pt++; phi_pt++; diag++; ); + FOR6( *eta2_pt = (*phi_pt)*(*diag); *eta1_pt = _COMPLEX_PRECISION_ZERO; eta1_pt++; eta2_pt++; phi_pt++; diag++; ); } } From a743535687fcd81062ab806be008ef48bd82a102 Mon Sep 17 00:00:00 2001 From: Marc Illa Date: Wed, 1 Aug 2018 11:47:41 +0300 Subject: [PATCH 04/31] Fixing vector_buffer incrementation 2 --- src/coarse_oddeven_generic.c | 2 +- src/oddeven_generic.c | 24 ++++++++++-------------- 2 files changed, 11 insertions(+), 15 deletions(-) diff --git a/src/coarse_oddeven_generic.c b/src/coarse_oddeven_generic.c index f6d9441..a6e5ad0 100644 --- a/src/coarse_oddeven_generic.c +++ b/src/coarse_oddeven_generic.c @@ -327,7 +327,7 @@ void coarse_diag_oo_PRECISION( vector_PRECISION *y, vector_PRECISION *x, operato sc += oo_inv_size*start; for ( int i=start; ivector_buffer += start; y->vector_buffer += start; PRECISION *sc_pt = op->clover_doublet_vectorized + (start/24)*288; - PRECISION *x_pt = (PRECISION*)x->vector_buffer; - PRECISION *y_pt = (PRECISION*)y->vector_buffer; + PRECISION *x_pt = (PRECISION*) (x->vector_buffer+start); + PRECISION *y_pt = (PRECISION*) (y->vector_buffer+start); for ( int i=start; ivector_buffer += start; y->vector_buffer += start; PRECISION *sc_pt = op->clover_vectorized + (start/12)*144; - PRECISION *x_pt = (PRECISION*)x->vector_buffer; - PRECISION *y_pt = (PRECISION*)y->vector_buffer; + PRECISION *x_pt = (PRECISION*) (x->vector_buffer+start); + PRECISION *y_pt = (PRECISION*) (y->vector_buffer+start); for ( int i=start; ivector_buffer += start; y->vector_buffer += start; PRECISION *sc_pt = op->clover_doublet_oo_inv_vectorized + (start/24)*2*288; - PRECISION *x_pt = (PRECISION*)x->vector_buffer; - PRECISION *y_pt = (PRECISION*)y->vector_buffer; + PRECISION *x_pt = (PRECISION*) (x->vector_buffer+start); + PRECISION *y_pt = (PRECISION*) (y->vector_buffer+start); for ( int 
i=start; ivector_buffer += start; y->vector_buffer += start; PRECISION *sc_pt = op->clover_vectorized + 2*2*(3*start); - PRECISION *x_pt = (PRECISION*)x->vector_buffer; - PRECISION *y_pt = (PRECISION*)y->vector_buffer; + PRECISION *x_pt = (PRECISION*) (x->vector_buffer+start); + PRECISION *y_pt = (PRECISION*) (y->vector_buffer+start); for ( int i=start; ivector_buffer[i] = phi->vector_buffer[i]; i++; ) - eta->vector_buffer+=6; phi->vector_buffer+=6; + i+=6; } } } @@ -1460,7 +1456,7 @@ void minus_g5_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, int start } } else { for ( int i = start; i < end; ) { - eta->vector_buffer+=6; phi->vector_buffer+=6; + i+=6; FOR6( eta->vector_buffer[i] = -phi->vector_buffer[i]; i++; ) } } From 4089cd29610c7f2851d50df8ada2d78025a57432 Mon Sep 17 00:00:00 2001 From: Marc Illa Date: Wed, 1 Aug 2018 12:12:38 +0300 Subject: [PATCH 05/31] Fixing vector initialization --- src/sse_linalg.c | 36 ++++++++++++++++++------------------ src/var_table.h | 15 ++++++++------- 2 files changed, 26 insertions(+), 25 deletions(-) diff --git a/src/sse_linalg.c b/src/sse_linalg.c index 2e1bc45..f3f2b28 100644 --- a/src/sse_linalg.c +++ b/src/sse_linalg.c @@ -470,7 +470,7 @@ void vector_double_multi_saxpy( vector_double *z, vector_double *V, complex_doub for ( int i=start; ivector_buffer+i) ); __m128d V_re = _mm_loadu_pd( (double*)(V[c].vector_buffer+i) ); z_re = sse_fmadd_pd( alpha_re[c], V_re, z_re ); _mm_storeu_pd( (double*)(z+i), z_re ); @@ -485,10 +485,10 @@ void vector_double_multi_saxpy( vector_double *z, vector_double *V, complex_doub FOR6( { __m128d z_re; __m128d z_im; __m128d V_re; __m128d V_im; - sse_complex_deinterleaved_load_pd( (double*)(z+i), &z_re, &z_im ); + sse_complex_deinterleaved_load_pd( (double*)(z->vector_buffer+i), &z_re, &z_im ); sse_complex_deinterleaved_load_pd( (double*)(V[c].vector_buffer+i), &V_re, &V_im ); cfmadd_pd(alpha_re[c], alpha_im[c], V_re, V_im, &z_re, &z_im); - sse_complex_interleaved_store_pd( z_re, z_im, (double*)(z+i) ); + sse_complex_interleaved_store_pd( z_re, z_im, (double*)(z->vector_buffer+i) ); i += SIMD_LENGTH_double; } ) @@ -527,10 +527,10 @@ void vector_float_multi_saxpy( vector_float *z, vector_float *V, complex_float * for ( int i=start; ivector_buffer+i) ); V_re = _mm_loadu_ps( (float*)(V[c].vector_buffer+i) ); z_re = sse_fmadd( alpha_re[c], V_re, z_re ); - _mm_storeu_ps( (float*)(z+i), z_re ); + _mm_storeu_ps( (float*)(z->vector_buffer+i), z_re ); i+=2; } ) @@ -541,10 +541,10 @@ void vector_float_multi_saxpy( vector_float *z, vector_float *V, complex_float * for ( int i=start; ivector_buffer+i), &z_re, &z_im ); sse_complex_deinterleaved_load( (float*)(V[c].vector_buffer+i), &V_re, &V_im ); cfmadd(alpha_re[c], alpha_im[c], V_re, V_im, &z_re, &z_im); - sse_complex_interleaved_store( z_re, z_im, (float*)(z+i) ); + sse_complex_interleaved_store( z_re, z_im, (float*)(z->vector_buffer+i) ); i+=SIMD_LENGTH_float; } ) @@ -555,20 +555,20 @@ void vector_float_multi_saxpy( vector_float *z, vector_float *V, complex_float * if ( flag == 0 ) { for ( int c=0; cvector_buffer+i) ); V_re = _mm_loadu_ps( (float*)(V[c].vector_buffer+i) ); z_re = sse_fmadd( alpha_re[c], V_re, z_re ); - _mm_storeu_ps( (float*)(z+i), z_re ); + _mm_storeu_ps( (float*)(z->vector_buffer+i), z_re ); i+=2; } } } else { for ( int c=0; cvector_buffer+i), &z_re, &z_im ); sse_complex_deinterleaved_load( (float*)(V[c].vector_buffer+i), &V_re, &V_im ); cfmadd(alpha_re[c], alpha_im[c], V_re, V_im, &z_re, &z_im); - sse_complex_interleaved_store( z_re, z_im, (float*)(z+i) 
);
+ sse_complex_interleaved_store( z_re, z_im, (float*)(z->vector_buffer+i) );
i+=SIMD_LENGTH_float;
}
}
@@ -605,17 +605,17 @@ void process_multi_inner_product_MP( int count, complex_double *results, vector_
__m128 psi_re; __m128 psi_im;
__m128 phi_re; __m128 phi_im;
// deinterleave complex numbers into 4 real parts and 4 imag parts
- sse_complex_deinterleaved_load( (float*)(psi+i), &psi_re, &psi_im );
+ sse_complex_deinterleaved_load( (float*)(psi->vector_buffer+i), &psi_re, &psi_im );
sse_complex_deinterleaved_load( (float*)(phi[c].vector_buffer+i), &phi_re, &phi_im );
cmul_conj(phi_re, phi_im, psi_re, psi_im, &result_re, &result_im);
- sse_complex_deinterleaved_load( (float*)(psi+i+4), &psi_re, &psi_im );
+ sse_complex_deinterleaved_load( (float*)(psi->vector_buffer+i+4), &psi_re, &psi_im );
sse_complex_deinterleaved_load( (float*)(phi[c].vector_buffer+i+4), &phi_re, &phi_im );
cfmadd_conj(phi_re, phi_im, psi_re, psi_im, &result_re, &result_im);
- sse_complex_deinterleaved_load( (float*)(psi+i+8), &psi_re, &psi_im );
+ sse_complex_deinterleaved_load( (float*)(psi->vector_buffer+i+8), &psi_re, &psi_im );
sse_complex_deinterleaved_load( (float*)(phi[c].vector_buffer+i+8), &phi_re, &phi_im );
cfmadd_conj(phi_re, phi_im, psi_re, psi_im, &result_re, &result_im);
@@ -670,7 +670,7 @@ void process_multi_inner_product_float( int count, complex_float *results, vecto
// deinterleave complex numbers into 4 real parts and 4 imag parts
sse_complex_deinterleaved_load( (float*)(phi[c].vector_buffer+i), &phi_re, &phi_im );
- sse_complex_deinterleaved_load( (float*)(psi+i), &psi_re, &psi_im );
+ sse_complex_deinterleaved_load( (float*)(psi->vector_buffer+i), &psi_re, &psi_im );
cfmadd_conj(phi_re, phi_im, psi_re, psi_im, &result_re, &result_im);
i+=SIMD_LENGTH_float;
@@ -690,7 +690,7 @@ void process_multi_inner_product_float( int count, complex_float *results, vecto
// deinterleave complex numbers into 4 real parts and 4 imag parts
sse_complex_deinterleaved_load( (float*)(phi[c].vector_buffer+i), &phi_re, &phi_im );
- sse_complex_deinterleaved_load( (float*)(psi+i), &psi_re, &psi_im );
+ sse_complex_deinterleaved_load( (float*)(psi->vector_buffer+i), &psi_re, &psi_im );
cfmadd_conj(phi_re, phi_im, psi_re, psi_im, &result_re, &result_im);
}
@@ -744,7 +744,7 @@ void process_multi_inner_product_double( int count, complex_double *results, vec
// deinterleave complex numbers into 4 real parts and 4 imag parts
sse_complex_deinterleaved_load_pd( (double*)(phi[c].vector_buffer+i), &phi_re, &phi_im );
- sse_complex_deinterleaved_load_pd( (double*)(psi+i), &pdi_re, &pdi_im );
+ sse_complex_deinterleaved_load_pd( (double*)(psi->vector_buffer+i), &pdi_re, &pdi_im );
cfmadd_conj_pd(phi_re, phi_im, pdi_re, pdi_im, &result_re, &result_im);
i+=SIMD_LENGTH_double;
@@ -764,7 +764,7 @@ void process_multi_inner_product_double( int count, complex_double *results, vec
// deinterleave complex numbers into 4 real parts and 4 imag parts
sse_complex_deinterleaved_load_pd( (double*)(phi[c].vector_buffer+i), &phi_re, &phi_im );
- sse_complex_deinterleaved_load_pd( (double*)(psi+i), &pdi_re, &pdi_im );
+ sse_complex_deinterleaved_load_pd( (double*)(psi->vector_buffer+i), &pdi_re, &pdi_im );
cfmadd_conj_pd(phi_re, phi_im, pdi_re, pdi_im, &result_re, &result_im);
}
diff --git a/src/var_table.h b/src/var_table.h
index bc9fa36..fc03f11 100644
--- a/src/var_table.h
+++ b/src/var_table.h
@@ -33,18 +33,19 @@
warning0("SCAN_VAR does not support threading, yet.\n"); \
kind *tmp_var = (kind*)(var_pt); \
kind signum = (start_val<end_val)?1:-1; \
[...] \
if ( g.vt.track_error ) { \
- MALLOC( v->vector_buffer, 
complex_double, l->inner_vector_size ); \
+ MALLOC( v.vector_buffer, complex_double, l->inner_vector_size ); \
if (g.mixed_precision==2) fgmres_MP( &(g.p_MP), l, no_threading ); \
else fgmres_double( &(g.p), l, no_threading ); \
- vector_double_copy( v, &x, 0, l->inner_vector_size, l ); \
- norm_v = global_norm_double( v, 0, l->inner_vector_size, l, no_threading ); \
+ vector_double_copy( &v, &x, 0, l->inner_vector_size, l ); \
+ norm_v = global_norm_double( &v, 0, l->inner_vector_size, l, no_threading ); \
} \
\
for ( *tmp_var = (kind)start_val; signum*(*tmp_var) <= signum*((kind)end_val) + EPS_double; \
@@ -68,13 +69,13 @@
} \
printf0("scanning variable \"%s\", value: %lf, run %d of %d\n", name, (double)(*tmp_var), i+1, g.vt.average_over ); \
if ( g.vt.track_error ) { \
- apply_operator_double( &b, v, &(g.p), l, no_threading ); \
+ apply_operator_double( &b, &v, &(g.p), l, no_threading ); \
vector_double_define( &x, 0, 0, l->inner_vector_size, l ); \
if ( g.vt.track_cgn_error ) { \
ASSERT( g.method >=0 && g.p.restart_length >= 4 ); \
vector_double_define( &x, 0, 0, l->inner_vector_size, l ); \
cgn_double( &(g.p), l, no_threading ); \
- vector_double_minus( &x, &x, v, 0, l->inner_vector_size, l ); \
+ vector_double_minus( &x, &x, &v, 0, l->inner_vector_size, l ); \
g.vt.p_end->values[_CGNR_ERR] += ( global_norm_double( &x, 0, l->inner_vector_size, l, no_threading ) / norm_v ) / ((double)g.vt.average_over); \
printf0("CGN: error norm: %le\n", g.vt.p_end->values[_CGNR_ERR] ); \
vector_double_define( &x, 0, 0, l->inner_vector_size, l ); \
@@ -93,7 +94,7 @@
} \
} \
if ( g.vt.track_error ) { \
- FREE( v->vector_buffer, complex_double, l->inner_vector_size ); \
+ FREE( v.vector_buffer, complex_double, l->inner_vector_size ); \
} \
tt1 = MPI_Wtime(); \
printf0("\n\ntotal time for parameter scan: %d minutes and %d seconds\n", \
From e0c3a561d4a4dee2f91d3023a33a95d5f6f4d451 Mon Sep 17 00:00:00 2001
From: Marc Illa
Date: Wed, 1 Aug 2018 12:14:09 +0300
Subject: [PATCH 06/31] Fixing vector passing

---
 src/var_table.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/var_table.h b/src/var_table.h
index fc03f11..cb83615 100644
--- a/src/var_table.h
+++ b/src/var_table.h
@@ -88,7 +88,7 @@
else fgmres_double( &(g.p), l, no_threading ); \
if ( i == g.vt.average_over-1 ) prof_print( l ); \
if ( g.vt.track_error ) { \
- vector_double_minus( &x, &x, v, 0, l->inner_vector_size, l ); \
+ vector_double_minus( &x, &x, &v, 0, l->inner_vector_size, l ); \
g.vt.p_end->values[_SLV_ERR] += ( global_norm_double( &x, 0, l->inner_vector_size, l, no_threading ) / norm_v ) / ((double)g.vt.average_over); \
} \
} \
From 33d00df075ae5c85e561c30532978ff79dadb574 Mon Sep 17 00:00:00 2001
From: Marc Illa
Date: Wed, 1 Aug 2018 12:46:11 +0300
Subject: [PATCH 07/31] Fixing vector variable initialization

---
 src/coarse_operator_generic.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/coarse_operator_generic.c b/src/coarse_operator_generic.c
index d316d0d..44f50ab 100644
--- a/src/coarse_operator_generic.c
+++ b/src/coarse_operator_generic.c
@@ -348,6 +348,8 @@ void coarse_block_operator_PRECISION( vector_PRECISION *eta, vector_PRECISION *p
lphi1.vector_buffer = lphi.vector_buffer+m*j;
coarse_hopp_PRECISION_vectorized( &leta1, &lphi1, Dplus + 4*vectorized_link_offset*k, l );
// daggered hopp
+ leta2.vector_buffer = leta.vector_buffer+m*j;
+ lphi2.vector_buffer = lphi.vector_buffer+m*k;
coarse_hopp_PRECISION_vectorized( &leta2, &lphi2, Dminus + 4*vectorized_link_offset*k, l );
}
}
From 
0c2b64dae9be402311dd6b3a8eaab14d62c83460 Mon Sep 17 00:00:00 2001 From: Marc Illa Date: Wed, 1 Aug 2018 12:49:30 +0300 Subject: [PATCH 08/31] Remove debug prints --- src/schwarz_generic.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/schwarz_generic.c b/src/schwarz_generic.c index ec64fa5..27cf865 100644 --- a/src/schwarz_generic.c +++ b/src/schwarz_generic.c @@ -1649,23 +1649,18 @@ void red_black_schwarz_PRECISION( vector_PRECISION *phi, vector_PRECISION *D_phi // perform the Schwarz iteration, solve the block systems for ( k=0; kblock_list[step][i]; - printf0("index: %d\n", index); START_MASTER(threading) PROF_PRECISION_START( _SM3 ); END_MASTER(threading) if ( res == _RES ) { if ( k==0 && init_res == _RES ) { - printf0("calling block_op\n"); block_op( Dphi, x, s->block[index].start*l->num_lattice_site_var, s, l, no_threading ); boundary_op( Dphi, x, index, s, l, no_threading ); vector_PRECISION_minus( r, eta, Dphi, s->block[index].start*l->num_lattice_site_var, s->block[index].start*l->num_lattice_site_var+s->block_vector_size, l ); } else { - printf0("calling n_boundary\n"); n_boundary_op( r, latest_iter, index, s, l ); } } @@ -1678,11 +1673,9 @@ void red_black_schwarz_PRECISION( vector_PRECISION *phi, vector_PRECISION *D_phi START_MASTER(threading) PROF_PRECISION_STOP( _SM4, 1 ); END_MASTER(threading) - printf0(" fin index %d\n", i); } if ( res_comm == _RES && !(k==cycles-1 && (step==6||step==7) && D_phi==NULL) ) { - printf0("calling comms\n"); START_LOCKED_MASTER(threading) for ( mu=0; mu<4; mu++ ) { communicate[(step%4)/2]( (k==0 && step < 6 && init_res == _RES)?x:latest_iter, mu, commdir[step], &(s->op.c), l ); From 7022eb97be7849184c7ebb540a8c3a5c4475605b Mon Sep 17 00:00:00 2001 From: Marc Illa Date: Wed, 1 Aug 2018 15:06:23 +0300 Subject: [PATCH 09/31] Correct pointer bugs --- src/linsolve_generic.c | 10 +++++----- src/setup_generic.c | 18 +++++++++--------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/linsolve_generic.c b/src/linsolve_generic.c index c8eb06c..0717d4b 100644 --- a/src/linsolve_generic.c +++ b/src/linsolve_generic.c @@ -263,7 +263,7 @@ int fgmres_PRECISION( gmres_PRECISION_struct *p, level_struct *l, struct Thread g.bicgstab_tol = (!g.mixed_precision)?p->tol:MAX( 1E-3, (p->tol/(gamma_jp1/norm_r0))*5E-1 ); END_LOCKED_MASTER(threading) } - p->preconditioner( p->w, NULL, p->Z[0], _NO_RES, l, threading ); + p->preconditioner( &(p->w), NULL, &(p->Z[0]), _NO_RES, l, threading ); } else { apply_operator_PRECISION( &(p->w), &(p->x), p, l, threading ); // compute w = D*x } @@ -287,7 +287,7 @@ int fgmres_PRECISION( gmres_PRECISION_struct *p, level_struct *l, struct Thread vector_PRECISION_real_scale( &(p->V[0]), &(p->r), 1/p->gamma[0], start, end, l ); // v_0 = r / gamma_0 #if defined(SINGLE_ALLREDUCE_ARNOLDI) && defined(PIPELINED_ARNOLDI) if ( l->level == 0 && l->depth > 0 ) { - arnoldi_step_PRECISION( p->V, p->Z, p->w, p->H, p->y, 0, p->preconditioner, p, l, threading ); + arnoldi_step_PRECISION( p->V, p->Z, &(p->w), p->H, p->y, 0, p->preconditioner, p, l, threading ); } #endif @@ -302,12 +302,12 @@ int fgmres_PRECISION( gmres_PRECISION_struct *p, level_struct *l, struct Thread // one step of Arnoldi #if defined(SINGLE_ALLREDUCE_ARNOLDI) && defined(PIPELINED_ARNOLDI) if ( l->level == 0 && l->depth > 0 ) { - if ( !arnoldi_step_PRECISION( p->V, p->Z, p->w, p->H, p->y, j+1, p->preconditioner, p, l, threading ) ) { + if ( !arnoldi_step_PRECISION( p->V, p->Z, &(p->w), p->H, p->y, j+1, p->preconditioner, p, l, threading ) ) { printf0("| 
-------------- iteration %d, restart due to H(%d,%d) < 0 |\n", iter, j+2, j+1 ); break; } } else { - if ( !arnoldi_step_PRECISION( p->V, p->Z, p->w, p->H, p->y, j, p->preconditioner, p, l, threading ) ) { + if ( !arnoldi_step_PRECISION( p->V, p->Z, &(p->w), p->H, p->y, j, p->preconditioner, p, l, threading ) ) { printf0("| -------------- iteration %d, restart due to H(%d,%d) < 0 |\n", iter, j+1, j ); break; } @@ -1066,7 +1066,7 @@ void fgcr_PRECISION( gmres_PRECISION_struct *p, level_struct *l ) { j = il; iter++; - p->preconditioner( p->V[j], p->r, _NO_RES, l, no_threading ); + p->preconditioner( &(p->V[j]), &(p->r), _NO_RES, l, no_threading ); apply_operator_PRECISION( &(p->Z[j]), &(p->V[j]), p, l, no_threading ); for( i=0; ilevel > 0 ) { - testvector_analysis_PRECISION( &(lp->is_PRECISION.test_vector), lp, threading ); + testvector_analysis_PRECISION( lp->is_PRECISION.test_vector, lp, threading ); lp = lp->next_level; if ( lp == NULL ) break; @@ -271,7 +271,7 @@ void interpolation_PRECISION_define( vector_double *V, level_struct *l, struct T - testvector_analysis_PRECISION( &(l->is_PRECISION.test_vector), l, threading ); + testvector_analysis_PRECISION( l->is_PRECISION.test_vector, l, threading ); #ifdef INTERPOLATION_SETUP_LAYOUT_OPTIMIZED_PRECISION define_interpolation_PRECISION_operator( &(l->is_PRECISION.test_vector->vector_buffer), l, threading ); @@ -532,7 +532,7 @@ void inv_iter_inv_fcycle_PRECISION( int setup_iter, level_struct *l, struct Thre } -void testvector_analysis_PRECISION( vector_PRECISION **test_vectors, level_struct *l, struct Thread *threading ) { +void testvector_analysis_PRECISION( vector_PRECISION *test_vectors, level_struct *l, struct Thread *threading ) { #ifdef TESTVECTOR_ANALYSIS START_UNTHREADED_FUNCTION(threading) if ( l->depth == 0 ) { @@ -542,12 +542,12 @@ void testvector_analysis_PRECISION( vector_PRECISION **test_vectors, level_struc printf0("--------------------------------------- depth: %d ----------------------------------------\n", l->depth ); for ( int i=0; inum_eig_vect; i++ ) { printf0("vector #%02d: ", i+1 ); - apply_operator_PRECISION( &(l->vbuf_PRECISION[3]), test_vectors[i], &(l->p_PRECISION), l, no_threading ); + apply_operator_PRECISION( &(l->vbuf_PRECISION[3]), test_vectors+i, &(l->p_PRECISION), l, no_threading ); coarse_gamma5_PRECISION( &(l->vbuf_PRECISION[0]), &(l->vbuf_PRECISION[3]), 0, l->inner_vector_size, l ); - lambda = global_inner_product_PRECISION( test_vectors[i], &(l->vbuf_PRECISION[0]), 0, l->inner_vector_size, l, no_threading ); - lambda /= global_inner_product_PRECISION( test_vectors[i], test_vectors[i], 0, l->inner_vector_size, l, no_threading ); - vector_PRECISION_saxpy( &(l->vbuf_PRECISION[1]), &(l->vbuf_PRECISION[0]), test_vectors[i], -lambda, 0, l->inner_vector_size, l ); - mu = global_norm_PRECISION( &(l->vbuf_PRECISION[1]), 0, l->inner_vector_size, l, no_threading )/global_norm_PRECISION( test_vectors[i], 0, l->inner_vector_size, l, no_threading ); + lambda = global_inner_product_PRECISION( test_vectors+i, &(l->vbuf_PRECISION[0]), 0, l->inner_vector_size, l, no_threading ); + lambda /= global_inner_product_PRECISION( test_vectors+i, test_vectors+i, 0, l->inner_vector_size, l, no_threading ); + vector_PRECISION_saxpy( &(l->vbuf_PRECISION[1]), &(l->vbuf_PRECISION[0]), test_vectors+i, -lambda, 0, l->inner_vector_size, l ); + mu = global_norm_PRECISION( &(l->vbuf_PRECISION[1]), 0, l->inner_vector_size, l, no_threading )/global_norm_PRECISION( test_vectors+i, 0, l->inner_vector_size, l, no_threading ); printf0("singular 
value: %+lf%+lfi, singular vector precision: %le\n", (double)creal(lambda), (double)cimag(lambda), (double)mu ); } printf0("--------------------------------------- depth: %d ----------------------------------------\n", l->depth ); From e3d86659c6494ad737fef8a8c2109c17fafbbbda Mon Sep 17 00:00:00 2001 From: Marc Illa Date: Wed, 1 Aug 2018 19:06:57 +0300 Subject: [PATCH 10/31] Fixing vectorization bugs --- src/interpolation_generic.c | 4 ++-- src/interpolation_generic.h | 2 +- src/linalg_generic.c | 4 ++-- src/linalg_generic.h | 2 +- src/setup_generic.c | 12 +++++----- src/sse_coarse_operator_generic.h | 6 ++--- src/sse_interpolation_generic.c | 14 +++++------ src/sse_linalg.c | 40 +++++++++++++++---------------- 8 files changed, 42 insertions(+), 42 deletions(-) diff --git a/src/interpolation_generic.c b/src/interpolation_generic.c index 79b26fe..f6e853e 100644 --- a/src/interpolation_generic.c +++ b/src/interpolation_generic.c @@ -71,7 +71,7 @@ void interpolation_PRECISION_free( level_struct *l ) { } -void define_interpolation_PRECISION_operator( complex_PRECISION **interpolation, level_struct *l, struct Thread *threading ) { +void define_interpolation_PRECISION_operator( vector_PRECISION *interpolation, level_struct *l, struct Thread *threading ) { int j, num_eig_vect = l->num_eig_vect; complex_PRECISION *operator = l->is_PRECISION.operator; @@ -83,7 +83,7 @@ void define_interpolation_PRECISION_operator( complex_PRECISION **interpolation, operator += start*num_eig_vect; for ( int i=start; iinner_vector_size; for ( j=0; jinner_vector_size, l ); + vector_PRECISION_scale( &W_tmp[j], W+j, diag[j], 0, l->inner_vector_size, l ); } process_multi_inner_product_PRECISION( k, ip, W_tmp, v, 0, l->inner_vector_size, l, threading ); diff --git a/src/linalg_generic.h b/src/linalg_generic.h index 2ef5dc1..aa3055d 100644 --- a/src/linalg_generic.h +++ b/src/linalg_generic.h @@ -116,7 +116,7 @@ int start, int end, level_struct *l ); void vector_PRECISION_saxpy( vector_PRECISION *z, vector_PRECISION *x, vector_PRECISION *y, complex_PRECISION alpha, int start, int end, level_struct *l ); // z := x + alpha*y void buffer_PRECISION_copy( complex_PRECISION *z, complex_PRECISION *x, int start, int end, level_struct *l ); // z := x - void vector_PRECISION_projection( vector_PRECISION *z, vector_PRECISION *v, int k, vector_PRECISION **W, complex_PRECISION *diag, + void vector_PRECISION_projection( vector_PRECISION *z, vector_PRECISION *v, int k, vector_PRECISION *W, complex_PRECISION *diag, int orthogonal, level_struct *l, Thread *threading ); void gram_schmidt_on_aggregates_PRECISION( vector_PRECISION *V, const int num_vec, level_struct *l, struct Thread *threading ); diff --git a/src/setup_generic.c b/src/setup_generic.c index 17fcb7c..ef1e967 100644 --- a/src/setup_generic.c +++ b/src/setup_generic.c @@ -274,11 +274,11 @@ void interpolation_PRECISION_define( vector_double *V, level_struct *l, struct T testvector_analysis_PRECISION( l->is_PRECISION.test_vector, l, threading ); #ifdef INTERPOLATION_SETUP_LAYOUT_OPTIMIZED_PRECISION - define_interpolation_PRECISION_operator( &(l->is_PRECISION.test_vector->vector_buffer), l, threading ); + define_interpolation_PRECISION_operator( l->is_PRECISION.test_vector, l, threading ); gram_schmidt_on_aggregates_PRECISION_vectorized( l->is_PRECISION.operator, n, l, threading ); #else gram_schmidt_on_aggregates_PRECISION( l->is_PRECISION.interpolation, n, l, threading ); - define_interpolation_PRECISION_operator( &(l->is_PRECISION.interpolation->vector_buffer), l, threading ); + 
define_interpolation_PRECISION_operator( l->is_PRECISION.interpolation, l, threading ); #endif } @@ -289,7 +289,7 @@ void re_setup_PRECISION( level_struct *l, struct Thread *threading ) { if ( l->level > 0 ) { if ( !l->idle ) { #ifdef INTERPOLATION_SETUP_LAYOUT_OPTIMIZED_PRECISION - define_interpolation_PRECISION_operator( &(l->is_PRECISION.test_vector->vector_buffer), l, threading ); + define_interpolation_PRECISION_operator( l->is_PRECISION.test_vector, l, threading ); gram_schmidt_on_aggregates_PRECISION_vectorized( l->is_PRECISION.operator, l->num_eig_vect, l, threading ); if ( l->depth > 0 ) gram_schmidt_on_aggregates_PRECISION_vectorized( l->is_PRECISION.operator, l->num_eig_vect, l, threading ); @@ -303,7 +303,7 @@ void re_setup_PRECISION( level_struct *l, struct Thread *threading ) { gram_schmidt_on_aggregates_PRECISION( l->is_PRECISION.interpolation, l->num_eig_vect, l, threading ); if ( l->depth > 0 ) gram_schmidt_on_aggregates_PRECISION( l->is_PRECISION.interpolation, l->num_eig_vect, l, threading ); - define_interpolation_PRECISION_operator( &(l->is_PRECISION.interpolation->vector_buffer), l, threading ); + define_interpolation_PRECISION_operator( l->is_PRECISION.interpolation, l, threading ); START_LOCKED_MASTER(threading) coarse_operator_PRECISION_setup( l->is_PRECISION.interpolation, l ); #endif @@ -388,7 +388,7 @@ void inv_iter_2lvl_extension_setup_PRECISION( int setup_iter, level_struct *l, s #endif #ifdef INTERPOLATION_SETUP_LAYOUT_OPTIMIZED_PRECISION - define_interpolation_PRECISION_operator( &(l->is_PRECISION.test_vector->vector_buffer), l, threading ); + define_interpolation_PRECISION_operator( l->is_PRECISION.test_vector, l, threading ); gram_schmidt_on_aggregates_PRECISION_vectorized( l->is_PRECISION.operator, l->num_eig_vect, l, threading ); if ( l->depth > 0 ) gram_schmidt_on_aggregates_PRECISION_vectorized( l->is_PRECISION.operator, l->num_eig_vect, l, threading ); @@ -401,7 +401,7 @@ void inv_iter_2lvl_extension_setup_PRECISION( int setup_iter, level_struct *l, s gram_schmidt_on_aggregates_PRECISION( l->is_PRECISION.interpolation, l->num_eig_vect, l, threading ); if ( l->depth > 0 ) gram_schmidt_on_aggregates_PRECISION( l->is_PRECISION.interpolation, l->num_eig_vect, l, threading ); - define_interpolation_PRECISION_operator( &(l->is_PRECISION.interpolation->vector_buffer), l, threading ); + define_interpolation_PRECISION_operator( l->is_PRECISION.interpolation, l, threading ); START_LOCKED_MASTER(threading) coarse_operator_PRECISION_setup( l->is_PRECISION.interpolation, l ); #endif diff --git a/src/sse_coarse_operator_generic.h b/src/sse_coarse_operator_generic.h index e51f44a..c9a0d3b 100644 --- a/src/sse_coarse_operator_generic.h +++ b/src/sse_coarse_operator_generic.h @@ -78,7 +78,7 @@ #ifdef OPTIMIZED_COARSE_NEIGHBOR_COUPLING_PRECISION int nv = l->num_parent_eig_vect; int lda = 2*SIMD_LENGTH_PRECISION*((nv+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - cgenmv_padded( 2*nv, D, lda, nv, (float *)phi, (float *)eta); + cgenmv_padded( 2*nv, D, lda, nv, (float *)phi->vector_buffer, (float *)eta->vector_buffer); #endif } static inline void coarse_n_hopp_PRECISION_vectorized( vector_PRECISION *eta, vector_PRECISION *phi, @@ -86,7 +86,7 @@ #ifdef OPTIMIZED_COARSE_NEIGHBOR_COUPLING_PRECISION int nv = l->num_parent_eig_vect; int lda = 2*SIMD_LENGTH_PRECISION*((nv+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - cgemv_padded( 2*nv, D, lda, nv, (float *)phi, (float *)eta); + cgemv_padded( 2*nv, D, lda, nv, (float *)phi->vector_buffer, (float *)eta->vector_buffer); 
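/* A note on the casts fixed in the hunks above, with a minimal sketch of the
 * failure mode (simplified, assumed layout -- the real vector_PRECISION
 * struct carries more fields than shown here). While vector_PRECISION was a
 * bare pointer typedef, (float *)phi handed the SSE kernels the data
 * directly; now that it is a struct, the same cast passes the address of the
 * struct itself, and the vector_buffer member has to be unwrapped explicitly:
 *
 *   typedef struct { complex_PRECISION *vector_buffer; } vector_sketch;
 *
 *   extern void kernel( float *data );
 *   vector_sketch *phi;
 *   kernel( (float *)phi );                  // wrong: address of the struct
 *   kernel( (float *)phi->vector_buffer );   // right: address of the data
 */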
#endif } @@ -103,7 +103,7 @@ for(int i=start; ivector_buffer[i*site_size+j] = 0.0; - cgemv(site_size, clover+i*2*site_size*lda, lda, (float *)(phi+i*site_size), (float *)(eta+i*site_size)); + cgemv(site_size, clover+i*2*site_size*lda, lda, (float *)(phi->vector_buffer+i*site_size), (float *)(eta->vector_buffer+i*site_size)); } #endif } diff --git a/src/sse_interpolation_generic.c b/src/sse_interpolation_generic.c index d51309c..122eb10 100644 --- a/src/sse_interpolation_generic.c +++ b/src/sse_interpolation_generic.c @@ -31,11 +31,11 @@ void interpolation_PRECISION_alloc( level_struct *l ) { MALLOC( l->is_PRECISION.test_vector, vector_PRECISION, n ); #ifndef INTERPOLATION_SETUP_LAYOUT_OPTIMIZED_PRECISION - MALLOC( l->is_PRECISION.interpolation, complex_PRECISION*, n ); - l->is_PRECISION.interpolation[0] = NULL; - MALLOC_HUGEPAGES( l->is_PRECISION.interpolation[0]->vector_buffer, complex_PRECISION, n*l->vector_size, 128 ); + MALLOC( l->is_PRECISION.interpolation, vector_PRECISION, n ); + l->is_PRECISION.interpolation[0].vector_buffer = NULL; + MALLOC_HUGEPAGES( l->is_PRECISION.interpolation[0].vector_buffer, complex_PRECISION, n*l->vector_size, 128 ); for ( k=1; kis_PRECISION.interpolation[k]->vector_buffer = l->is_PRECISION.interpolation[0]->vector_buffer + k*l->vector_size; + l->is_PRECISION.interpolation[k].vector_buffer = l->is_PRECISION.interpolation[0].vector_buffer + k*l->vector_size; #endif // ghost shell is communicated in coarse_operator_setup, so we need size=vector_size, not inner_vector_size MALLOC_HUGEPAGES( l->is_PRECISION.operator, complex_PRECISION, @@ -71,7 +71,7 @@ void interpolation_PRECISION_free( level_struct *l ) { FREE( l->is_PRECISION.eigenvalues, complex_PRECISION, n ); FREE( l->is_PRECISION.test_vector, vector_PRECISION, n ); #ifndef INTERPOLATION_SETUP_LAYOUT_OPTIMIZED_PRECISION - FREE_HUGEPAGES( l->is_PRECISION.interpolation[0]->vector_buffer, complex_PRECISION, n*l->vector_size ); + FREE_HUGEPAGES( l->is_PRECISION.interpolation[0].vector_buffer, complex_PRECISION, n*l->vector_size ); FREE( l->is_PRECISION.interpolation, vector_PRECISION, n ); #endif FREE_HUGEPAGES( l->is_PRECISION.operator, complex_PRECISION, OPERATOR_COMPONENT_OFFSET_PRECISION*l->vector_size ); @@ -94,7 +94,7 @@ void swap8_PRECISION( PRECISION* data ) { } -void define_interpolation_PRECISION_operator( complex_PRECISION **interpolation, level_struct *l, struct Thread *threading ) { +void define_interpolation_PRECISION_operator( vector_PRECISION *interpolation, level_struct *l, struct Thread *threading ) { int j, num_eig_vect = l->num_eig_vect; complex_PRECISION *operator = l->is_PRECISION.operator; @@ -114,7 +114,7 @@ void define_interpolation_PRECISION_operator( complex_PRECISION **interpolation, for ( int i=start; ivector_buffer+start); + float *xf = (float*)(x->vector_buffer+start); if ( l->depth == 0 ) { for( int i=start; ivector_buffer+i), &x_re, &x_im ); + sse_complex_deinterleaved_load( (float*)(y->vector_buffer+i), &y_re, &y_im ); cfmadd(alpha_re, alpha_im, y_re, y_im, &x_re, &x_im); - sse_complex_interleaved_store( x_re, x_im, (float*)(z+i) ); + sse_complex_interleaved_store( x_re, x_im, (float*)(z->vector_buffer+i) ); i+=SIMD_LENGTH_float; } ) @@ -132,10 +132,10 @@ void vector_float_saxpy( vector_float *z, vector_float *x, vector_float *y, comp } else { for ( int i=start; ivector_buffer+i), &x_re, &x_im ); + sse_complex_deinterleaved_load( (float*)(y->vector_buffer+i), &y_re, &y_im ); cfmadd(alpha_re, alpha_im, y_re, y_im, &x_re, &x_im); - sse_complex_interleaved_store( x_re, x_im, 
(float*)(z+i) ); + sse_complex_interleaved_store( x_re, x_im, (float*)(z->vector_buffer+i) ); i+=SIMD_LENGTH_float; } } @@ -159,10 +159,10 @@ void vector_double_saxpy( vector_double *z, vector_double *x, vector_double *y, FOR6( { __m128d x_re; __m128d x_im; __m128d y_re; __m128d y_im; - sse_complex_deinterleaved_load_pd( (double*)(x+i), &x_re, &x_im ); - sse_complex_deinterleaved_load_pd( (double*)(y+i), &y_re, &y_im ); + sse_complex_deinterleaved_load_pd( (double*)(x->vector_buffer+i), &x_re, &x_im ); + sse_complex_deinterleaved_load_pd( (double*)(y->vector_buffer+i), &y_re, &y_im ); cfmadd_pd(alpha_re, alpha_im, y_re, y_im, &x_re, &x_im); - sse_complex_interleaved_store_pd( x_re, x_im, (double*)(z+i) ); + sse_complex_interleaved_store_pd( x_re, x_im, (double*)(z->vector_buffer+i) ); i+=SIMD_LENGTH_double; } ) @@ -194,8 +194,8 @@ complex_double global_inner_product_double( vector_double *phi, vector_double *p { __m128d phi_re; __m128d phi_im; __m128d psi_re; __m128d psi_im; - sse_complex_deinterleaved_load_pd( (double*)(phi+i), &phi_re, &phi_im ); - sse_complex_deinterleaved_load_pd( (double*)(psi+i), &psi_re, &psi_im ); + sse_complex_deinterleaved_load_pd( (double*)(phi->vector_buffer+i), &phi_re, &phi_im ); + sse_complex_deinterleaved_load_pd( (double*)(psi->vector_buffer+i), &psi_re, &psi_im ); cfmadd_conj_pd( phi_re, phi_im, psi_re, psi_im, &alpha_re, &alpha_im ); i+=SIMD_LENGTH_double; } @@ -205,8 +205,8 @@ complex_double global_inner_product_double( vector_double *phi, vector_double *p for( int i=thread_start; ivector_buffer+i), &phi_re, &phi_im ); + sse_complex_deinterleaved_load_pd( (double*)(psi->vector_buffer+i), &psi_re, &psi_im ); cfmadd_conj_pd( phi_re, phi_im, psi_re, psi_im, &alpha_re, &alpha_im ); i+=SIMD_LENGTH_double; } @@ -263,8 +263,8 @@ complex_float global_inner_product_float( vector_float *phi, vector_float *psi, __m128 alpha_re = _mm_setzero_ps(); __m128 alpha_im = _mm_setzero_ps(); - float *phif = (float*)(phi+thread_start); - float *psif = (float*)(psi+thread_start); + float *phif = (float*)(phi->vector_buffer+thread_start); + float *psif = (float*)(psi->vector_buffer+thread_start); if ( l->depth == 0 ) { for( int i=thread_start; ivector_buffer+i)); alpha = sse_fmadd( phi, phi, alpha ); i += 2; } @@ -406,7 +406,7 @@ float global_norm_float( vector_float *x, int start, int end, level_struct *l, s } } else { for( int i=thread_start; ivector_buffer+i)); alpha = sse_fmadd( phi, phi, alpha ); i += 2; } @@ -473,7 +473,7 @@ void vector_double_multi_saxpy( vector_double *z, vector_double *V, complex_doub __m128d z_re = _mm_loadu_pd( (double*)(z->vector_buffer+i) ); __m128d V_re = _mm_loadu_pd( (double*)(V[c].vector_buffer+i) ); z_re = sse_fmadd_pd( alpha_re[c], V_re, z_re ); - _mm_storeu_pd( (double*)(z+i), z_re ); + _mm_storeu_pd( (double*)(z->vector_buffer+i), z_re ); i++; } ) From b03857cd5ce678dab515bd0059ae69dca4106d1c Mon Sep 17 00:00:00 2001 From: Marc Illa Date: Thu, 2 Aug 2018 17:46:29 +0300 Subject: [PATCH 11/31] Changing vector allocation --- src/DDalphaAMG_interface.c | 58 +++++----- src/coarse_oddeven_generic.c | 69 ++++++------ src/coarse_operator_generic.c | 184 ++++++++++++++++---------------- src/dirac.c | 35 +++--- src/dirac_generic.c | 77 ++++++------- src/gathering_generic.c | 72 ++++++------- src/ghost_generic.c | 81 +++++++------- src/init_generic.c | 88 ++++++--------- src/interpolation_generic.c | 21 ++-- src/linalg_generic.c | 20 ++-- src/main.h | 2 +- src/main_pre_def_generic.h | 5 +- src/oddeven_generic.c | 116 +++++++++----------- 
src/operator_generic.c | 60 ++++++----- src/schwarz_generic.c | 90 ++++++++-------- src/setup_generic.c | 29 ++--- src/sse_interpolation_generic.c | 27 ++--- src/top_level.c | 12 +-- src/var_table.h | 4 +- src/vector_generic.c | 32 ++++-- src/vector_generic.h | 10 +- 21 files changed, 540 insertions(+), 552 deletions(-) diff --git a/src/DDalphaAMG_interface.c b/src/DDalphaAMG_interface.c index edc54e7..9f8b45b 100644 --- a/src/DDalphaAMG_interface.c +++ b/src/DDalphaAMG_interface.c @@ -1051,7 +1051,11 @@ static inline void DDalphaAMG_ms_driver( double **vector1_out, double *vector1_i gmres_double_struct *p = g.mixed_precision==2?&(g.p_MP.dp):&(g.p); buffer_double vb, vx; vector_double *rhs =&(p->b), *sol = &(p->x); - vector_double *source=NULL, *solution=NULL, *solution2=NULL; + vector_double source, solution, solution2; + + vector_double_init( &source ); + vector_double_init( &solution ); + vector_double_init( &solution2 ); DDalphaAMG_status tmp_status; @@ -1182,10 +1186,10 @@ static inline void DDalphaAMG_ms_driver( double **vector1_out, double *vector1_i ASSERT( odd_shifts != NULL ); } if ( n_shifts > 1 ) { - MALLOC( source->vector_buffer, complex_double, l.inner_vector_size ); - MALLOC( solution->vector_buffer, complex_double, l.inner_vector_size ); + vector_double_alloc( &source, _INNER, 1, &l, no_threading); + vector_double_alloc( &solution, _INNER, 1, &l, no_threading); if( _TYPE == _SOLVE_SQ || _TYPE == _SOLVE_SQ_ODD || _TYPE == _SOLVE_SQ_EVEN ) - MALLOC( solution2->vector_buffer, complex_double, l.inner_vector_size ); + vector_double_alloc( &solution2, _INNER, 1, &l, no_threading); } for ( n = 0; n < n_shifts; n++ ) { @@ -1221,10 +1225,10 @@ static inline void DDalphaAMG_ms_driver( double **vector1_out, double *vector1_i case _SOLVE : if ( n ) { - vector_copy( rhs, source ); + vector_copy( rhs, &source ); p->initial_guess_zero = 0; } else if ( n_shifts > 1 ) - vector_copy( source, rhs ); + vector_copy( &source, rhs ); solver( ); break; @@ -1232,7 +1236,7 @@ static inline void DDalphaAMG_ms_driver( double **vector1_out, double *vector1_i case _SOLVE_SQ : if ( n ) { - vector_copy( rhs, source ); + vector_copy( rhs, &source ); p->initial_guess_zero = 0; } else if ( n_shifts > 1 ) { THREADED(threading[0]->n_core) @@ -1244,18 +1248,18 @@ static inline void DDalphaAMG_ms_driver( double **vector1_out, double *vector1_i #endif // sol = (D_d^{-1})*g5*(D_u^{-1})*g5*rhs gamma5_double( rhs, rhs, &l, threading[omp_get_thread_num()] ); - vector_copy( source, rhs ); + vector_copy( &source, rhs ); } if( n ) - correct_guess( sol, solution, solution2, even_shifts[n]-even_shifts[n-1], odd_shifts[n]-odd_shifts[n-1]); + correct_guess( sol, &solution, &solution2, even_shifts[n]-even_shifts[n-1], odd_shifts[n]-odd_shifts[n-1]); // read NOTE RESIDUAL THREADED(threading[0]->n_core) nrhs = global_norm_double( rhs, p->v_start, p->v_end, &l, threading[omp_get_thread_num()] ); p->tol = tol[n]/2.; solver( ); if ( n < n_shifts-1 ) - vector_copy( solution, sol ); + vector_copy( &solution, sol ); THREADED(threading[0]->n_core) #ifdef HAVE_TM1p1 @@ -1273,7 +1277,7 @@ static inline void DDalphaAMG_ms_driver( double **vector1_out, double *vector1_i DDalphaAMG_change_mu_sign( &tmp_status ); if( n ) - vector_copy( sol, solution2 ); + vector_copy( sol, &solution2 ); // read NOTE RESIDUAL THREADED(threading[0]->n_core) @@ -1281,7 +1285,7 @@ static inline void DDalphaAMG_ms_driver( double **vector1_out, double *vector1_i p->tol = (tol[n]-g.norm_res)*nrhs/nrhs2/8.; solver( ); if ( n < n_shifts-1 ) - vector_copy( 
solution2, sol ); + vector_copy( &solution2, sol ); // DDalphaAMG_change_mu_sign( &tmp_status ); warning0("sign of mu changed during the inversion of squared operator\n"); @@ -1290,7 +1294,7 @@ static inline void DDalphaAMG_ms_driver( double **vector1_out, double *vector1_i case _SOLVE_SQ_ODD : if ( n ) { - vector_copy( rhs, source ); + vector_copy( rhs, &source ); p->initial_guess_zero = 0; } else if ( n_shifts > 1 ) { THREADED(threading[0]->n_core) @@ -1303,11 +1307,11 @@ static inline void DDalphaAMG_ms_driver( double **vector1_out, double *vector1_i // sol = (D_d^{-1})*g5*(D_u^{-1})*g5*rhs gamma5_set_even_to_zero_double(rhs, rhs, &l, threading[omp_get_thread_num()]); - vector_copy( source, rhs ); + vector_copy( &source, rhs ); } if( n ) - correct_guess( sol, solution, solution2, even_shifts[n]-even_shifts[n-1], odd_shifts[n]-odd_shifts[n-1]); + correct_guess( sol, &solution, &solution2, even_shifts[n]-even_shifts[n-1], odd_shifts[n]-odd_shifts[n-1]); // read NOTE RESIDUAL THREADED(threading[0]->n_core) @@ -1315,7 +1319,7 @@ static inline void DDalphaAMG_ms_driver( double **vector1_out, double *vector1_i p->tol = tol[n]/2.; solver( ); if ( n < n_shifts-1 ) - vector_copy( solution, sol ); + vector_copy( &solution, sol ); THREADED(threading[0]->n_core) #ifdef HAVE_TM1p1 @@ -1333,7 +1337,7 @@ static inline void DDalphaAMG_ms_driver( double **vector1_out, double *vector1_i DDalphaAMG_change_mu_sign( &tmp_status ); if( n ) - vector_copy( sol, solution2 ); + vector_copy( sol, &solution2 ); // read NOTE RESIDUAL THREADED(threading[0]->n_core) @@ -1341,7 +1345,7 @@ static inline void DDalphaAMG_ms_driver( double **vector1_out, double *vector1_i p->tol = (tol[n]-g.norm_res)*nrhs/nrhs2/8.; solver( ); if ( n < n_shifts-1 ) - vector_copy( solution2, sol ); + vector_copy( &solution2, sol ); // DDalphaAMG_change_mu_sign( &tmp_status ); warning0("sign of mu changed during the inversion of squared operator\n"); @@ -1350,7 +1354,7 @@ static inline void DDalphaAMG_ms_driver( double **vector1_out, double *vector1_i case _SOLVE_SQ_EVEN : if ( n ) { - vector_copy( rhs, source ); + vector_copy( rhs, &source ); p->initial_guess_zero = 0; } else if ( n_shifts > 1 ) { THREADED(threading[0]->n_core) @@ -1363,11 +1367,11 @@ static inline void DDalphaAMG_ms_driver( double **vector1_out, double *vector1_i // sol = (D_d^{-1})*g5*(D_u^{-1})*g5*rhs gamma5_set_odd_to_zero_double(rhs, rhs, &l, threading[omp_get_thread_num()]); - vector_copy( source, rhs ); + vector_copy( &source, rhs ); } if( n ) - correct_guess( sol, solution, solution2, even_shifts[n]-even_shifts[n-1], odd_shifts[n]-odd_shifts[n-1]); + correct_guess( sol, &solution, &solution2, even_shifts[n]-even_shifts[n-1], odd_shifts[n]-odd_shifts[n-1]); // read NOTE RESIDUAL THREADED(threading[0]->n_core) @@ -1375,7 +1379,7 @@ static inline void DDalphaAMG_ms_driver( double **vector1_out, double *vector1_i p->tol = tol[n]/2.; solver( ); if ( n < n_shifts-1 ) - vector_copy( solution, sol ); + vector_copy( &solution, sol ); THREADED(threading[0]->n_core) #ifdef HAVE_TM1p1 @@ -1393,14 +1397,14 @@ static inline void DDalphaAMG_ms_driver( double **vector1_out, double *vector1_i DDalphaAMG_change_mu_sign( &tmp_status ); if( n ) - vector_copy( sol, solution2 ); + vector_copy( sol, &solution2 ); // read NOTE RESIDUAL THREADED(threading[0]->n_core) nrhs2 = global_norm_double( rhs, p->v_start, p->v_end, &l, threading[omp_get_thread_num()] ); p->tol = (tol[n]-g.norm_res)*nrhs/nrhs2/8.; solver( ); if ( n < n_shifts-1 ) - vector_copy( solution2, sol ); + vector_copy( 
&solution2, sol ); // DDalphaAMG_change_mu_sign( &tmp_status ); warning0("sign of mu changed during the inversion of squared operator\n"); @@ -1489,10 +1493,10 @@ static inline void DDalphaAMG_ms_driver( double **vector1_out, double *vector1_i p->initial_guess_zero = 1; if ( n_shifts > 0 ) { - FREE( source->vector_buffer, complex_double, l.inner_vector_size ); - FREE( solution->vector_buffer, complex_double, l.inner_vector_size ); + vector_double_free( &source, &l, no_threading); + vector_double_free( &solution, &l, no_threading); if( _TYPE == _SOLVE_SQ || _TYPE == _SOLVE_SQ_ODD || _TYPE == _SOLVE_SQ_EVEN ) - FREE( solution2->vector_buffer, complex_double, l.inner_vector_size ); + vector_double_free( &solution2, &l, no_threading); } diff --git a/src/coarse_oddeven_generic.c b/src/coarse_oddeven_generic.c index a6e5ad0..af7000b 100644 --- a/src/coarse_oddeven_generic.c +++ b/src/coarse_oddeven_generic.c @@ -309,9 +309,9 @@ void coarse_diag_ee_PRECISION( vector_PRECISION *y, vector_PRECISION *x, operato void coarse_diag_oo_PRECISION( vector_PRECISION *y, vector_PRECISION *x, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { int start, end; - vector_PRECISION x_pt, y_pt; #ifndef OPTIMIZED_COARSE_SELF_COUPLING_PRECISION + vector_PRECISION x_pt, y_pt; int num_site_var=l->num_lattice_site_var, oo_inv_size = SQUARE(num_site_var); #ifdef HAVE_TM1p1 @@ -335,7 +335,7 @@ void coarse_diag_oo_PRECISION( vector_PRECISION *y, vector_PRECISION *x, operato #else compute_core_start_end_custom( op->num_even_sites, l->num_inner_lattice_sites, &start, &end, l, threading, 1 ); - coarse_self_couplings_PRECISION_vectorized( &y_pt, &x_pt, op, start, end, l ); + coarse_self_couplings_PRECISION_vectorized( y, x, op, start, end, l ); #endif } @@ -382,7 +382,7 @@ void coarse_diag_oo_inv_PRECISION( vector_PRECISION *y, vector_PRECISION *x, ope #else for(int j=0; jbuffer, vector_PRECISION, 2 ); - vector_PRECISION_init(&(op->buffer[0])); + for (int k=0; k<2; k++ ){ + vector_PRECISION_init( &(op->buffer[k]) ); #ifdef HAVE_TM1p1 - MALLOC( op->buffer[0].vector_buffer, complex_PRECISION, 4*l->vector_size ); - op->buffer[1].vector_buffer = op->buffer[0].vector_buffer + 2*l->vector_size; + vector_PRECISION_alloc( &(op->buffer[k]), _ORDINARY, 2, l, no_threading ); #else - MALLOC( op->buffer[0].vector_buffer, complex_PRECISION, 2*l->vector_size ); - op->buffer[1].vector_buffer = op->buffer[0].vector_buffer + l->vector_size; + vector_PRECISION_alloc( &(op->buffer[k]), _ORDINARY, 1, l, no_threading ); #endif - + } for ( mu=0; mu<4; mu++ ) { le[mu] = l->local_lattice[mu]; N[mu] = le[mu]+1; @@ -604,7 +603,7 @@ void coarse_oddeven_setup_PRECISION( operator_PRECISION_struct *in, int reorder, void coarse_oddeven_free_PRECISION( level_struct *l ) { - int nv = l->num_parent_eig_vect, vs = l->vector_size; + int nv = l->num_parent_eig_vect; operator_PRECISION_struct *op = &(l->oe_op_PRECISION); operator_PRECISION_free( op, _ODDEVEN, l ); @@ -626,12 +625,8 @@ void coarse_oddeven_free_PRECISION( level_struct *l ) { #endif #endif - -#ifdef HAVE_TM1p1 - FREE( op->buffer[0].vector_buffer, complex_PRECISION, 4*vs ); -#else - FREE( op->buffer[0].vector_buffer, complex_PRECISION, 2*vs ); -#endif + for (int k=0; k<2; k++ ) + vector_PRECISION_free( &(op->buffer[k]), l, no_threading ); FREE( op->buffer, vector_PRECISION, 2 ); } @@ -1474,52 +1469,52 @@ void g5D_coarse_apply_schur_complement_PRECISION( vector_PRECISION *out, vector_ void coarse_odd_even_PRECISION_test( vector_PRECISION *out, vector_PRECISION *in, level_struct 
*l, struct Thread *threading ) { if ( g.odd_even ) { - vector_PRECISION buf1, buf2; - - vector_PRECISION_init(&buf1); - vector_PRECISION_init(&buf2); - - PUBLIC_MALLOC( buf1.vector_buffer, complex_PRECISION, 2*l->vector_size ); - buf2.vector_buffer = buf1.vector_buffer + l->vector_size; + vector_PRECISION buf[2]; + for(int i=0; i<2; i++){ + vector_PRECISION_init( &buf[i] ); + vector_PRECISION_alloc( &buf[i], _ORDINARY, 1, l, threading ); + } + START_LOCKED_MASTER(threading) // transformation part - vector_PRECISION_copy( &buf1, in, 0, l->inner_vector_size, l ); + vector_PRECISION_copy( &buf[0], in, 0, l->inner_vector_size, l ); // even to odd vector_PRECISION_define( out, 0, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l->inner_vector_size, l ); END_LOCKED_MASTER(threading) - coarse_hopping_term_PRECISION( out, &buf1, &(l->oe_op_PRECISION), _ODD_SITES, l, threading ); - coarse_diag_oo_inv_PRECISION( &buf2, out, &(l->oe_op_PRECISION), l, threading ); + coarse_hopping_term_PRECISION( out, &buf[0], &(l->oe_op_PRECISION), _ODD_SITES, l, threading ); + coarse_diag_oo_inv_PRECISION( &buf[1], out, &(l->oe_op_PRECISION), l, threading ); START_LOCKED_MASTER(threading) - vector_PRECISION_plus( &buf1, &buf1, &buf2, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l->inner_vector_size, l ); + vector_PRECISION_plus( &buf[0], &buf[0], &buf[1], l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l->inner_vector_size, l ); END_LOCKED_MASTER(threading) // block diagonal part if ( g.method == 6 ) { - g5D_coarse_apply_schur_complement_PRECISION( out, &buf1, &(l->oe_op_PRECISION), l, threading ); + g5D_coarse_apply_schur_complement_PRECISION( out, &buf[0], &(l->oe_op_PRECISION), l, threading ); } else { - coarse_apply_schur_complement_PRECISION( out, &buf1, &(l->oe_op_PRECISION), l, threading ); + coarse_apply_schur_complement_PRECISION( out, &buf[0], &(l->oe_op_PRECISION), l, threading ); } - coarse_diag_oo_PRECISION( out, &buf1, &(l->oe_op_PRECISION), l, threading ); + coarse_diag_oo_PRECISION( out, &buf[0], &(l->oe_op_PRECISION), l, threading ); // back transformation part - coarse_diag_oo_inv_PRECISION( &buf2, out, &(l->oe_op_PRECISION), l, threading ); + coarse_diag_oo_inv_PRECISION( &buf[1], out, &(l->oe_op_PRECISION), l, threading ); if ( g.method == 6 ) { START_LOCKED_MASTER(threading) coarse_gamma5_PRECISION( out, out, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l->inner_vector_size, l ); - vector_PRECISION_define( &buf1, 0, 0, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l ); - coarse_hopping_term_PRECISION( &buf1, &buf2, &(l->oe_op_PRECISION), _EVEN_SITES, l, no_threading ); - coarse_gamma5_PRECISION( &buf1, &buf1, 0, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l ); - vector_PRECISION_plus( out, out, &buf1, 0, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l ); + vector_PRECISION_define( &buf[0], 0, 0, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l ); + coarse_hopping_term_PRECISION( &buf[0], &buf[1], &(l->oe_op_PRECISION), _EVEN_SITES, l, no_threading ); + coarse_gamma5_PRECISION( &buf[0], &buf[0], 0, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l ); + vector_PRECISION_plus( out, out, &buf[0], 0, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l ); END_LOCKED_MASTER(threading) } else { - coarse_hopping_term_PRECISION( out, &buf2, &(l->oe_op_PRECISION), _EVEN_SITES, l, threading ); + coarse_hopping_term_PRECISION( out, &buf[1], &(l->oe_op_PRECISION), _EVEN_SITES, l, 
threading ); } - PUBLIC_FREE( buf1.vector_buffer, complex_PRECISION, 2*l->vector_size ); + for(int i=0; i<2; i++) + vector_PRECISION_free( &buf[i], l, threading ); } } diff --git a/src/coarse_operator_generic.c b/src/coarse_operator_generic.c index 44f50ab..5d8750f 100644 --- a/src/coarse_operator_generic.c +++ b/src/coarse_operator_generic.c @@ -728,22 +728,22 @@ void apply_coarse_operator_dagger_PRECISION( vector_PRECISION *eta, vector_PRECI void coarse_operator_PRECISION_test_routine( level_struct *l, struct Thread *threading ) { if ( !l->idle ) { - int vs = l->vector_size, ivs = l->inner_vector_size, - cvs = l->next_level->vector_size, civs = l->next_level->inner_vector_size; + int ivs = l->inner_vector_size, civs = l->next_level->inner_vector_size; PRECISION diff = 0; - vector_PRECISION vp1, vp2, vp3, vp4, vc1, vc2, vc3; - - vector_PRECISION_init(&vp1); - vector_PRECISION_init(&vc1); + vector_PRECISION vp[4], vc[3]; + + for(int i=0; i<4; i++){ + vector_PRECISION_init( &vp[i] ); + vector_PRECISION_alloc( &vp[i], _ORDINARY, 1, l, threading ); + } - PUBLIC_MALLOC( vp1.vector_buffer, complex_PRECISION, 4*vs ); - PUBLIC_MALLOC( vc1.vector_buffer, complex_PRECISION, 3*cvs ); + for(int i=0; i<3; i++){ + vector_PRECISION_init( &vc[i] ); + vector_PRECISION_alloc( &vc[i], _ORDINARY, 1, l->next_level, threading ); + } SYNC_MASTER_TO_ALL(threading) - vp2.vector_buffer = vp1.vector_buffer + vs; vp3.vector_buffer = vp2.vector_buffer + vs; vp4.vector_buffer = vp3.vector_buffer + vs; - vc2.vector_buffer = vc1.vector_buffer + cvs; vc3.vector_buffer = vc2.vector_buffer + cvs; - START_LOCKED_MASTER(threading) #ifdef HAVE_TM1p1 if(g.n_flavours == 1) @@ -767,33 +767,33 @@ void coarse_operator_PRECISION_test_routine( level_struct *l, struct Thread *thr } if ( !l->next_level->idle ) - vector_PRECISION_define_random( &vc1, 0, civs, l->next_level ); - vector_PRECISION_distribute( &vc2, &vc1, l->next_level ); - vector_PRECISION_gather( &vc3, &vc2, l->next_level ); + vector_PRECISION_define_random( &vc[0], 0, civs, l->next_level ); + vector_PRECISION_distribute( &vc[1], &vc[0], l->next_level ); + vector_PRECISION_gather( &vc[2], &vc[1], l->next_level ); if ( !l->next_level->idle ) { - vector_PRECISION_minus( &vc2, &vc1, &vc3, 0, civs, l->next_level ); - diff = global_norm_PRECISION( &vc2, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( &vc1, 0, civs, l->next_level, no_threading ); + vector_PRECISION_minus( &vc[1], &vc[0], &vc[2], 0, civs, l->next_level ); + diff = global_norm_PRECISION( &vc[1], 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( &vc[0], 0, civs, l->next_level, no_threading ); } test0_PRECISION("depth: %d, correctness of gather( distribute( phi_c ) ) : %le\n", l->depth, diff ); if ( !l->next_level->idle ) - vector_PRECISION_define_random( &vc1, 0, civs, l->next_level ); - interpolate3_PRECISION( &vp1, &vc1, l, no_threading ); - restrict_PRECISION( &vc2, &vp1, l, no_threading ); + vector_PRECISION_define_random( &vc[0], 0, civs, l->next_level ); + interpolate3_PRECISION( &vp[0], &vc[0], l, no_threading ); + restrict_PRECISION( &vc[1], &vp[0], l, no_threading ); if ( !l->next_level->idle ) { - vector_PRECISION_minus( &vc3, &vc1, &vc2, 0, civs, l->next_level ); - diff = global_norm_PRECISION( &vc3, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( &vc1, 0, civs, l->next_level, no_threading ); + vector_PRECISION_minus( &vc[2], &vc[0], &vc[1], 0, civs, l->next_level ); + diff = global_norm_PRECISION( &vc[2], 0, civs, l->next_level, no_threading ) / 
global_norm_PRECISION( &vc[0], 0, civs, l->next_level, no_threading ); test0_PRECISION("depth: %d, correctness of ( P* P - 1 ) phi_c: %le\n", l->depth, abs_PRECISION(diff) ); } END_LOCKED_MASTER(threading) if(threading->n_core>1) { - interpolate3_PRECISION( &vp1, &vc1, l, threading ); - restrict_PRECISION( &vc2, &vp1, l, threading ); + interpolate3_PRECISION( &vp[0], &vc[0], l, threading ); + restrict_PRECISION( &vc[1], &vp[0], l, threading ); START_LOCKED_MASTER(threading) if ( !l->next_level->idle ) { - vector_PRECISION_minus( &vc3, &vc1, &vc2, 0, civs, l->next_level ); - diff = global_norm_PRECISION( &vc3, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( &vc1, 0, civs, l->next_level, no_threading ); + vector_PRECISION_minus( &vc[2], &vc[0], &vc[1], 0, civs, l->next_level ); + diff = global_norm_PRECISION( &vc[2], 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( &vc[0], 0, civs, l->next_level, no_threading ); test0_PRECISION("depth: %d, correctness of ( P* P - 1 ) phi_c with threading: %le\n", l->depth, diff ); } END_LOCKED_MASTER(threading) @@ -801,27 +801,27 @@ void coarse_operator_PRECISION_test_routine( level_struct *l, struct Thread *thr START_LOCKED_MASTER(threading) if (l->depth==0) - gamma5_PRECISION( &vp2, &vp1, l, no_threading ); + gamma5_PRECISION( &vp[1], &vp[0], l, no_threading ); else - coarse_gamma5_PRECISION( &vp2, &vp1, 0, ivs, l ); - restrict_PRECISION( &vc2, &vp2, l, no_threading ); - coarse_gamma5_PRECISION( &vc3, &vc2, 0, civs, l->next_level ); + coarse_gamma5_PRECISION( &vp[1], &vp[0], 0, ivs, l ); + restrict_PRECISION( &vc[1], &vp[1], l, no_threading ); + coarse_gamma5_PRECISION( &vc[2], &vc[1], 0, civs, l->next_level ); if ( !l->next_level->idle ) { - vector_PRECISION_minus( &vc2, &vc1, &vc3, 0, civs, l->next_level ); - diff = global_norm_PRECISION( &vc2, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( &vc1, 0, civs, l->next_level, no_threading ); + vector_PRECISION_minus( &vc[1], &vc[0], &vc[2], 0, civs, l->next_level ); + diff = global_norm_PRECISION( &vc[1], 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( &vc[0], 0, civs, l->next_level, no_threading ); test0_PRECISION("depth: %d, correctness of ( g5_c P* g5 P - 1 ) phi_c: %le\n", l->depth, diff ); } #ifdef HAVE_TM1p1 if(g.n_flavours == 2) { if (l->depth==0) - tau1_gamma5_PRECISION( &vp2, &vp1, l, no_threading ); + tau1_gamma5_PRECISION( &vp[1], &vp[0], l, no_threading ); else - coarse_tau1_gamma5_PRECISION( &vp2, &vp1, 0, ivs, l ); - restrict_PRECISION( &vc2, &vp2, l, no_threading ); - coarse_tau1_gamma5_PRECISION( &vc3, &vc2, 0, civs, l->next_level ); + coarse_tau1_gamma5_PRECISION( &vp[1], &vp[0], 0, ivs, l ); + restrict_PRECISION( &vc[1], &vp[1], l, no_threading ); + coarse_tau1_gamma5_PRECISION( &vc[2], &vc[1], 0, civs, l->next_level ); if ( !l->next_level->idle ) { - vector_PRECISION_minus( &vc2, &vc1, &vc3, 0, civs, l->next_level ); - diff = global_norm_PRECISION( &vc2, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( &vc1, 0, civs, l->next_level, no_threading ); + vector_PRECISION_minus( &vc[1], &vc[0], &vc[2], 0, civs, l->next_level ); + diff = global_norm_PRECISION( &vc[1], 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( &vc[0], 0, civs, l->next_level, no_threading ); test0_PRECISION("depth: %d, correctness of ( tau1 g5_c P* tau1 g5 P - 1 ) phi_c: %le\n", l->depth, diff ); } } @@ -829,32 +829,32 @@ void coarse_operator_PRECISION_test_routine( level_struct *l, struct Thread *thr END_LOCKED_MASTER(threading) 
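/* All the checks in this routine follow one pattern (in outline; O stands
 * for whichever term is under test -- gamma5, the odd projector, the tm
 * term, the doublet coupling, or the full operator): take a random coarse
 * vector vc[0], prolongate it, apply the fine-grid term, restrict, and
 * compare with the coarse analogue O_c,
 *
 *   diff = || ( P* O P - O_c ) vc[0] || / || vc[0] || ,
 *
 * with vp[0] = P vc[0] and vp[1] = O vp[0] as fine-grid intermediates. For a
 * consistently constructed (Galerkin-type) coarse operator the quotient
 * should sit at machine-precision level, which is what the test0_PRECISION
 * lines report per depth.
 */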
START_LOCKED_MASTER(threading) - vector_PRECISION_define( &vp2, 0, 0, ivs, l ); + vector_PRECISION_define( &vp[1], 0, 0, ivs, l ); if (l->depth==0) - add_diagonal_PRECISION( &vp2, &vp1, l->s_PRECISION.op.odd_proj, ivs ); + add_diagonal_PRECISION( &vp[1], &vp[0], l->s_PRECISION.op.odd_proj, ivs ); else - coarse_add_block_diagonal_PRECISION( &vp2, &vp1, l->s_PRECISION.op.odd_proj, ivs, l ); - restrict_PRECISION( &vc2, &vp2, l, no_threading ); + coarse_add_block_diagonal_PRECISION( &vp[1], &vp[0], l->s_PRECISION.op.odd_proj, ivs, l ); + restrict_PRECISION( &vc[1], &vp[1], l, no_threading ); - vector_PRECISION_scale( &vc2, &vc2, -1.0, 0, civs, l->next_level ); - coarse_add_block_diagonal_PRECISION( &vc2, &vc1, l->next_level->s_PRECISION.op.odd_proj, civs, l->next_level ); - diff = global_norm_PRECISION( &vc2, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( &vc1, 0, civs, l->next_level, no_threading ); + vector_PRECISION_scale( &vc[1], &vc[1], -1.0, 0, civs, l->next_level ); + coarse_add_block_diagonal_PRECISION( &vc[1], &vc[0], l->next_level->s_PRECISION.op.odd_proj, civs, l->next_level ); + diff = global_norm_PRECISION( &vc[1], 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( &vc[0], 0, civs, l->next_level, no_threading ); test0_PRECISION("depth: %d, correctness of ( P* 1odd P - 1odd_c ) phi_c: %le\n", l->depth, diff ); END_LOCKED_MASTER(threading) #ifdef HAVE_TM START_LOCKED_MASTER(threading) if (g.mu + g.mu_odd_shift != 0.0 || g.mu + g.mu_even_shift != 0.0 ) { - vector_PRECISION_define( &vp2, 0, 0, ivs, l ); + vector_PRECISION_define( &vp[1], 0, 0, ivs, l ); if (l->depth==0) - add_diagonal_PRECISION( &vp2, &vp1, l->s_PRECISION.op.tm_term, ivs ); + add_diagonal_PRECISION( &vp[1], &vp[0], l->s_PRECISION.op.tm_term, ivs ); else - coarse_add_anti_block_diagonal_PRECISION( &vp2, &vp1, l->s_PRECISION.op.tm_term, ivs, l ); - restrict_PRECISION( &vc2, &vp2, l, no_threading ); + coarse_add_anti_block_diagonal_PRECISION( &vp[1], &vp[0], l->s_PRECISION.op.tm_term, ivs, l ); + restrict_PRECISION( &vc[1], &vp[1], l, no_threading ); - vector_PRECISION_scale( &vc2, &vc2, -g.mu_factor[l->next_level->depth]/g.mu_factor[l->depth], 0, civs, l->next_level ); - coarse_add_anti_block_diagonal_PRECISION( &vc2, &vc1, l->next_level->s_PRECISION.op.tm_term, civs, l->next_level ); - diff = global_norm_PRECISION( &vc2, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( &vc1, 0, civs, l->next_level, no_threading ); + vector_PRECISION_scale( &vc[1], &vc[1], -g.mu_factor[l->next_level->depth]/g.mu_factor[l->depth], 0, civs, l->next_level ); + coarse_add_anti_block_diagonal_PRECISION( &vc[1], &vc[0], l->next_level->s_PRECISION.op.tm_term, civs, l->next_level ); + diff = global_norm_PRECISION( &vc[1], 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( &vc[0], 0, civs, l->next_level, no_threading ); test0_PRECISION("depth: %d, correctness of ( P* tm P - tm_c ) phi_c: %le\n", l->depth, diff ); } END_LOCKED_MASTER(threading) @@ -864,16 +864,16 @@ void coarse_operator_PRECISION_test_routine( level_struct *l, struct Thread *thr START_LOCKED_MASTER(threading) if ( g.n_flavours == 2 && ( g.epsbar != 0 || g.epsbar_ig5_odd_shift != 0 || g.epsbar_ig5_odd_shift != 0 ) ) { - vector_PRECISION_define( &vp2, 0, 0, ivs, l ); + vector_PRECISION_define( &vp[1], 0, 0, ivs, l ); if (l->depth==0) - apply_doublet_coupling_PRECISION( &vp2, &vp1, l->s_PRECISION.op.epsbar_term, ivs ); + apply_doublet_coupling_PRECISION( &vp[1], &vp[0], l->s_PRECISION.op.epsbar_term, ivs ); else - 
coarse_add_doublet_coupling_PRECISION( &vp2, &vp1, l->s_PRECISION.op.epsbar_term, ivs, l ); - restrict_PRECISION( &vc2, &vp2, l, no_threading ); + coarse_add_doublet_coupling_PRECISION( &vp[1], &vp[0], l->s_PRECISION.op.epsbar_term, ivs, l ); + restrict_PRECISION( &vc[1], &vp[1], l, no_threading ); - vector_PRECISION_scale( &vc2, &vc2, -g.epsbar_factor[l->next_level->depth]/g.epsbar_factor[l->depth], 0, civs, l->next_level ); - coarse_add_doublet_coupling_PRECISION( &vc2, &vc1, l->next_level->s_PRECISION.op.epsbar_term, civs, l->next_level ); - diff = global_norm_PRECISION( &vc2, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( &vc1, 0, civs, l->next_level, no_threading ); + vector_PRECISION_scale( &vc[1], &vc[1], -g.epsbar_factor[l->next_level->depth]/g.epsbar_factor[l->depth], 0, civs, l->next_level ); + coarse_add_doublet_coupling_PRECISION( &vc[1], &vc[0], l->next_level->s_PRECISION.op.epsbar_term, civs, l->next_level ); + diff = global_norm_PRECISION( &vc[1], 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( &vc[0], 0, civs, l->next_level, no_threading ); test0_PRECISION("depth: %d, correctness of ( P* eps P - eps_c ) phi_c: %le\n", l->depth, diff ); } END_LOCKED_MASTER(threading) @@ -881,30 +881,30 @@ void coarse_operator_PRECISION_test_routine( level_struct *l, struct Thread *thr if ( l->level > 0 ) { START_LOCKED_MASTER(threading) - interpolate3_PRECISION( &vp1, &vc1, l, no_threading ); + interpolate3_PRECISION( &vp[0], &vc[0], l, no_threading ); - apply_operator_PRECISION( &vp2, &vp1, &(l->p_PRECISION), l, no_threading ); + apply_operator_PRECISION( &vp[1], &vp[0], &(l->p_PRECISION), l, no_threading ); #ifdef HAVE_TM if (g.mu + g.mu_odd_shift != 0.0 || g.mu + g.mu_even_shift != 0.0 ) if (g.mu_factor[l->depth] != g.mu_factor[l->next_level->depth]) { - vector_PRECISION_scale( &vp3, &vp1, (g.mu_factor[l->next_level->depth]/g.mu_factor[l->depth])-1., 0, ivs, l ); + vector_PRECISION_scale( &vp[2], &vp[0], (g.mu_factor[l->next_level->depth]/g.mu_factor[l->depth])-1., 0, ivs, l ); if(l->depth == 0) - add_diagonal_PRECISION( &vp2, &vp3, l->p_PRECISION.op->tm_term, ivs ); + add_diagonal_PRECISION( &vp[1], &vp[2], l->p_PRECISION.op->tm_term, ivs ); else - coarse_add_anti_block_diagonal_PRECISION( &vp2, &vp3, l->p_PRECISION.op->tm_term, ivs, l ); + coarse_add_anti_block_diagonal_PRECISION( &vp[1], &vp[2], l->p_PRECISION.op->tm_term, ivs, l ); } #endif - restrict_PRECISION( &vc2, &vp2, l, no_threading ); + restrict_PRECISION( &vc[1], &vp[1], l, no_threading ); if ( !l->next_level->idle ) { if ( l->level==1 && g.odd_even ) - coarse_odd_even_PRECISION_test( &vc3, &vc1, l->next_level, no_threading ); + coarse_odd_even_PRECISION_test( &vc[2], &vc[0], l->next_level, no_threading ); else - apply_operator_PRECISION( &vc3, &vc1, &(l->next_level->p_PRECISION), l->next_level, no_threading ); + apply_operator_PRECISION( &vc[2], &vc[0], &(l->next_level->p_PRECISION), l->next_level, no_threading ); - vector_PRECISION_minus( &vc3, &vc2, &vc3, 0, civs, l->next_level ); - diff = global_norm_PRECISION( &vc3, 0, civs, l->next_level, no_threading ) /global_norm_PRECISION( &vc2, 0, civs, l->next_level, no_threading ); + vector_PRECISION_minus( &vc[2], &vc[1], &vc[2], 0, civs, l->next_level ); + diff = global_norm_PRECISION( &vc[2], 0, civs, l->next_level, no_threading ) /global_norm_PRECISION( &vc[1], 0, civs, l->next_level, no_threading ); if ( l->level==1 && g.odd_even ) { test0_PRECISION("depth: %d, correctness of odd even preconditioned ( P* D P - D_c ) phi_c: %le\n", l->depth, 
diff ); @@ -917,14 +917,14 @@ void coarse_operator_PRECISION_test_routine( level_struct *l, struct Thread *thr if(threading->n_core>1) { if ( !l->next_level->idle ) { if ( l->level==1 && g.odd_even ) - coarse_odd_even_PRECISION_test( &vc3, &vc1, l->next_level, threading ); + coarse_odd_even_PRECISION_test( &vc[2], &vc[0], l->next_level, threading ); else - apply_operator_PRECISION( &vc3, &vc1, &(l->next_level->p_PRECISION), l->next_level, threading ); + apply_operator_PRECISION( &vc[2], &vc[0], &(l->next_level->p_PRECISION), l->next_level, threading ); } START_LOCKED_MASTER(threading) if ( !l->next_level->idle ) { - vector_PRECISION_minus( &vc3, &vc2, &vc3, 0, civs, l->next_level ); - diff = global_norm_PRECISION( &vc3, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( &vc2, 0, civs, l->next_level, no_threading ); + vector_PRECISION_minus( &vc[2], &vc[1], &vc[2], 0, civs, l->next_level ); + diff = global_norm_PRECISION( &vc[2], 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( &vc[1], 0, civs, l->next_level, no_threading ); if ( l->level==1 && g.odd_even ) { //TODO: this test doesn't work without SSE!! test0_PRECISION("depth: %d, correctness of odd even preconditioned ( P* D P - D_c ) phi_c with D_c threaded: %le\n", l->depth, diff ); } else { @@ -937,29 +937,33 @@ void coarse_operator_PRECISION_test_routine( level_struct *l, struct Thread *thr START_LOCKED_MASTER(threading) if ( l->level > 0 && l->depth > 0 && g.method == 3 && g.odd_even ) { - vector_PRECISION_define_random( &vp1, 0, ivs, l ); - block_to_oddeven_PRECISION( &vp4, &vp1, l, no_threading ); - coarse_diag_ee_PRECISION( &vp3, &vp4, &(l->oe_op_PRECISION), l, no_threading ); - coarse_diag_oo_PRECISION( &vp3, &vp4, &(l->oe_op_PRECISION), l, no_threading ); - coarse_hopping_term_PRECISION( &vp3, &vp4, &(l->oe_op_PRECISION), _FULL_SYSTEM, l, no_threading ); - oddeven_to_block_PRECISION( &vp4, &vp3, l, no_threading ); - apply_operator_PRECISION( &vp2, &vp1, &(l->p_PRECISION), l, no_threading ); - vector_PRECISION_minus( &vp4, &vp4, &vp2, 0, ivs, l ); - diff = global_norm_PRECISION( &vp4, 0, ivs, l, no_threading ) / global_norm_PRECISION( &vp2, 0, ivs, l, no_threading ); + vector_PRECISION_define_random( &vp[0], 0, ivs, l ); + block_to_oddeven_PRECISION( &vp[3], &vp[0], l, no_threading ); + coarse_diag_ee_PRECISION( &vp[2], &vp[3], &(l->oe_op_PRECISION), l, no_threading ); + coarse_diag_oo_PRECISION( &vp[2], &vp[3], &(l->oe_op_PRECISION), l, no_threading ); + coarse_hopping_term_PRECISION( &vp[2], &vp[3], &(l->oe_op_PRECISION), _FULL_SYSTEM, l, no_threading ); + oddeven_to_block_PRECISION( &vp[3], &vp[2], l, no_threading ); + apply_operator_PRECISION( &vp[1], &vp[0], &(l->p_PRECISION), l, no_threading ); + vector_PRECISION_minus( &vp[3], &vp[3], &vp[1], 0, ivs, l ); + diff = global_norm_PRECISION( &vp[3], 0, ivs, l, no_threading ) / global_norm_PRECISION( &vp[1], 0, ivs, l, no_threading ); test0_PRECISION("depth: %d, correctness of odd even layout (smoother): %le\n", l->depth, diff ); - block_to_oddeven_PRECISION( &vp4, &vp1, l, no_threading ); - coarse_odd_even_PRECISION_test( &vp3, &vp4, l, no_threading ); - oddeven_to_block_PRECISION( &vp4, &vp3, l, no_threading ); - apply_operator_PRECISION( &vp2, &vp1, &(l->p_PRECISION), l, no_threading ); - vector_PRECISION_minus( &vp4, &vp4, &vp2, 0, ivs, l ); - diff = global_norm_PRECISION( &vp4, 0, ivs, l, no_threading ) / global_norm_PRECISION( &vp2, 0, ivs, l, no_threading ); + block_to_oddeven_PRECISION( &vp[3], &vp[0], l, no_threading ); + 
coarse_odd_even_PRECISION_test( &vp[2], &vp[3], l, no_threading ); + oddeven_to_block_PRECISION( &vp[3], &vp[2], l, no_threading ); + apply_operator_PRECISION( &vp[1], &vp[0], &(l->p_PRECISION), l, no_threading ); + vector_PRECISION_minus( &vp[3], &vp[3], &vp[1], 0, ivs, l ); + diff = global_norm_PRECISION( &vp[3], 0, ivs, l, no_threading ) / global_norm_PRECISION( &vp[1], 0, ivs, l, no_threading ); test0_PRECISION("depth: %d, correctness of odd even preconditioned operator (smoother): %le\n", l->depth, diff ); } + + for(int i=0; i<4; i++) + vector_PRECISION_free( &vp[i], l, threading ); - FREE( vp1.vector_buffer, complex_PRECISION, 4*vs ); - FREE( vc1.vector_buffer, complex_PRECISION, 3*cvs ); - END_LOCKED_MASTER(threading) + for(int i=0; i<3; i++) + vector_PRECISION_free( &vc[i], l->next_level, threading ); + + END_LOCKED_MASTER(threading) if ( g.method != 6 && l->next_level->level > 0 && !l->next_level->idle ) { schwarz_PRECISION_mvm_testfun( &(l->next_level->s_PRECISION), l->next_level, threading ); diff --git a/src/dirac.c b/src/dirac.c index 94d4aca..e04fe20 100644 --- a/src/dirac.c +++ b/src/dirac.c @@ -436,12 +436,7 @@ void SU3_ghost_update( SU3_storage *U, level_struct *l ) { int t, z, y, x, mu, nu, *ll = l->local_lattice, ls[4], le[4]; long int i, j, send_size, max_size; - vector_double buffer1, buffer2, buffer3, buffer4; - - vector_double_init(&buffer1); - vector_double_init(&buffer2); - vector_double_init(&buffer3); - vector_double_init(&buffer4); + buffer_double buffer1 = NULL, buffer2 = NULL, buffer3 = NULL, buffer4 = NULL; max_size = 0; for ( mu=0; mu<4; mu++ ) { @@ -453,10 +448,10 @@ void SU3_ghost_update( SU3_storage *U, level_struct *l ) { if (send_size > max_size) max_size = send_size; } - MALLOC( buffer1.vector_buffer, complex_double, max_size ); - MALLOC( buffer2.vector_buffer, complex_double, max_size ); - MALLOC( buffer3.vector_buffer, complex_double, max_size ); - MALLOC( buffer4.vector_buffer, complex_double, max_size ); + MALLOC( buffer1, complex_double, max_size ); + MALLOC( buffer2, complex_double, max_size ); + MALLOC( buffer3, complex_double, max_size ); + MALLOC( buffer4, complex_double, max_size ); for ( mu=0; mu<4; mu++ ) { ls[mu] = 1; @@ -472,13 +467,13 @@ void SU3_ghost_update( SU3_storage *U, level_struct *l ) { for ( y=ls[Y]; yneighbor_rank[2*mu], 2*mu, g.comm_cart, &(g.rreqs[2*mu]) ); - MPI_Isend( buffer1.vector_buffer, send_size, MPI_COMPLEX_double, l->neighbor_rank[2*mu+1], 2*mu, g.comm_cart, &(g.sreqs[2*mu]) ); + MPI_Irecv( buffer3, send_size, MPI_COMPLEX_double, l->neighbor_rank[2*mu], 2*mu, g.comm_cart, &(g.rreqs[2*mu]) ); + MPI_Isend( buffer1, send_size, MPI_COMPLEX_double, l->neighbor_rank[2*mu+1], 2*mu, g.comm_cart, &(g.sreqs[2*mu]) ); // send own positive inner boundary ls[mu] = ll[mu]; @@ -488,13 +483,13 @@ void SU3_ghost_update( SU3_storage *U, level_struct *l ) { for ( y=ls[Y]; yneighbor_rank[2*mu+1], 2*mu+1, g.comm_cart, &(g.rreqs[2*mu+1]) ); - MPI_Isend( buffer2.vector_buffer, send_size, MPI_COMPLEX_double, l->neighbor_rank[2*mu], 2*mu+1, g.comm_cart, &(g.sreqs[2*mu+1]) ); + MPI_Irecv( buffer4, send_size, MPI_COMPLEX_double, l->neighbor_rank[2*mu+1], 2*mu+1, g.comm_cart, &(g.rreqs[2*mu+1]) ); + MPI_Isend( buffer2, send_size, MPI_COMPLEX_double, l->neighbor_rank[2*mu], 2*mu+1, g.comm_cart, &(g.sreqs[2*mu+1]) ); //recv own positive boundary MPI_Wait( &(g.sreqs[2*mu]), MPI_STATUS_IGNORE ); @@ -507,7 +502,7 @@ void SU3_ghost_update( SU3_storage *U, level_struct *l ) { for ( y=ls[Y]; yinner_vector_size; - - PUBLIC_MALLOC( vd1.vector_buffer, 
complex_double, 4*ivs + 2*4*ivs ); - PUBLIC_MALLOC( vpp1.vector_buffer, complex_PRECISION, 2*2*ivs ); - - vd2.vector_buffer = vd1.vector_buffer + ivs; vd3.vector_buffer = vd2.vector_buffer + ivs; vd4.vector_buffer = vd3.vector_buffer + ivs; - vdd1.vector_buffer = vd4.vector_buffer + ivs; vdd2.vector_buffer = vdd1.vector_buffer + 2*ivs; vdd3.vector_buffer = vdd2.vector_buffer + 2*ivs; vdd4.vector_buffer = vdd3.vector_buffer + 2*ivs; - vpp2.vector_buffer = vpp1.vector_buffer + 2*ivs; - START_LOCKED_MASTER(threading) - vector_double_define_random( &vd1, 0, l->inner_vector_size, l ); - vector_double_define_random( &vd2, 0, l->inner_vector_size, l ); - apply_operator_double( &vd3, &vd1, &(g.p), l, no_threading ); + vector_double_define_random( &vd[0], 0, l->inner_vector_size, l ); + vector_double_define_random( &vd[1], 0, l->inner_vector_size, l ); + apply_operator_double( &vd[2], &vd[0], &(g.p), l, no_threading ); #ifdef HAVE_TM buffer_double_real_scale( g.op_double.tm_term, g.op_double.tm_term, -1, 0, l->inner_vector_size, l ); #endif - apply_operator_double( &vd4, &vd2, &(g.p), l, no_threading ); + apply_operator_double( &vd[3], &vd[1], &(g.p), l, no_threading ); #ifdef HAVE_TM buffer_double_real_scale( g.op_double.tm_term, g.op_double.tm_term, -1, 0, l->inner_vector_size, l ); #endif - add_diagonal_double( &vd3, &vd2, g.op_double.epsbar_term, l->inner_vector_size ); - add_diagonal_double( &vd4, &vd1, g.op_double.epsbar_term, l->inner_vector_size ); + add_diagonal_double( &vd[2], &vd[1], g.op_double.epsbar_term, l->inner_vector_size ); + add_diagonal_double( &vd[3], &vd[0], g.op_double.epsbar_term, l->inner_vector_size ); - two_flavours_to_serial_double( &vd1, &vd2, &vdd1, l, no_threading ); - two_flavours_to_serial_double( &vd3, &vd4, &vdd2, l, no_threading ); + two_flavours_to_serial_double( &vd[0], &vd[1], &vdd[0], l, no_threading ); + two_flavours_to_serial_double( &vd[2], &vd[3], &vdd[1], l, no_threading ); END_LOCKED_MASTER(threading) data_layout_n_flavours( 2, l, threading ); START_LOCKED_MASTER(threading) - trans_PRECISION( &vpp1, &vdd1, op->translation_table, l, no_threading ); - apply_operator_PRECISION( &vpp2, &vpp1, &(l->p_PRECISION), l, no_threading ); - trans_back_PRECISION( &vdd3, &vpp2, op->translation_table, l, no_threading ); + trans_PRECISION( &vpp[0], &vdd[0], op->translation_table, l, no_threading ); + apply_operator_PRECISION( &vpp[1], &vpp[0], &(l->p_PRECISION), l, no_threading ); + trans_back_PRECISION( &vdd[2], &vpp[1], op->translation_table, l, no_threading ); - vector_double_minus( &vdd4, &vdd3, &vdd2, 0, l->inner_vector_size, l ); - diff = global_norm_double( &vdd4, 0, l->inner_vector_size, l, no_threading ) / - global_norm_double( &vdd3, 0, l->inner_vector_size, l, no_threading ); + vector_double_minus( &vdd[3], &vdd[2], &vdd[1], 0, l->inner_vector_size, l ); + diff = global_norm_double( &vdd[3], 0, l->inner_vector_size, l, no_threading ) / + global_norm_double( &vdd[2], 0, l->inner_vector_size, l, no_threading ); test0_PRECISION("depth: %d, correctness of doublet Dirac operator PRECISION: %le\n", l->depth, diff ); END_LOCKED_MASTER(threading) if(threading->n_core > 1) { - trans_PRECISION( &vpp1, &vdd1, op->translation_table, l, threading ); - apply_operator_PRECISION( &vpp2, &vpp1, &(l->p_PRECISION), l, threading ); - trans_back_PRECISION( &vdd3, &vpp2, op->translation_table, l, threading ); + trans_PRECISION( &vpp[0], &vdd[0], op->translation_table, l, threading ); + apply_operator_PRECISION( &vpp[1], &vpp[0], &(l->p_PRECISION), l, threading ); + 
trans_back_PRECISION( &vdd[2], &vpp[1], op->translation_table, l, threading ); SYNC_MASTER_TO_ALL(threading) SYNC_CORES(threading) START_LOCKED_MASTER(threading) - vector_double_minus( &vdd4, &vdd3, &vdd2, 0, l->inner_vector_size, l ); - diff = global_norm_double( &vdd4, 0, l->inner_vector_size, l, no_threading ) / - global_norm_double( &vdd3, 0, l->inner_vector_size, l, no_threading ); + vector_double_minus( &vdd[3], &vdd[2], &vdd[1], 0, l->inner_vector_size, l ); + diff = global_norm_double( &vdd[3], 0, l->inner_vector_size, l, no_threading ) / + global_norm_double( &vdd[2], 0, l->inner_vector_size, l, no_threading ); test0_PRECISION("depth: %d, correctness of doublet Dirac operator PRECISION with threading: %le\n", l->depth, diff ); END_LOCKED_MASTER(threading) } - PUBLIC_FREE( vd1.vector_buffer, complex_double, 4*ivs + 2*4*ivs ); - PUBLIC_FREE( vpp1.vector_buffer, complex_PRECISION, 2*2*ivs ); + for(int i=0; i<4; i++){ + vector_double_free( &vd[i], l, threading ); + vector_double_free( &vdd[i], l, threading ); + } + + for(int i=0; i<2; i++) + vector_PRECISION_free( &vpp[i], l, threading ); START_LOCKED_MASTER(threading) if ( g.method >=4 && g.odd_even ) diff --git a/src/gathering_generic.c b/src/gathering_generic.c index d4952fb..fbf0445 100644 --- a/src/gathering_generic.c +++ b/src/gathering_generic.c @@ -96,9 +96,9 @@ void gathering_PRECISION_setup( gathering_PRECISION_struct *gs, level_struct *l MALLOC( gs->permutation, int, l->num_inner_lattice_sites ); MALLOC( gs->reqs, MPI_Request, gs->gather_list_length ); #ifdef HAVE_TM1p1 - MALLOC( gs->buffer.vector_buffer, complex_PRECISION, 2*l->inner_vector_size ); + vector_PRECISION_alloc( &(gs->buffer), _INNER, 2, l, no_threading ); #else - MALLOC( gs->buffer.vector_buffer, complex_PRECISION, l->inner_vector_size ); + vector_PRECISION_alloc( &(gs->buffer), _INNER, 1, l, no_threading ); #endif MALLOC( field1, int, l->num_inner_lattice_sites ); MALLOC( field2, int, l->num_inner_lattice_sites ); @@ -212,11 +212,7 @@ void gathering_PRECISION_free( gathering_PRECISION_struct *gs, level_struct *l ) FREE( gs->gather_list, int, gs->gather_list_length ); FREE( gs->permutation, int, l->num_inner_lattice_sites ); FREE( gs->reqs, MPI_Request, gs->gather_list_length ); -#ifdef HAVE_TM1p1 - FREE( gs->buffer.vector_buffer, complex_PRECISION, 2*l->inner_vector_size ); -#else - FREE( gs->buffer.vector_buffer, complex_PRECISION, l->inner_vector_size ); -#endif + vector_PRECISION_free( &(gs->buffer), l, no_threading ); } MPI_Comm_free( &(gs->level_comm) ); @@ -270,30 +266,24 @@ void conf_PRECISION_gather( operator_PRECISION_struct *out, operator_PRECISION_s } else { int i, j, n=l->gs_PRECISION.gather_list_length, s=l->num_inner_lattice_sites, t, *pi = l->gs_PRECISION.permutation; - vector_PRECISION buffer_hopp, buffer_clov, buffer_odd_proj; + buffer_PRECISION buffer_hopp = NULL, buffer_clov = NULL, buffer_odd_proj = NULL; MPI_Request *hopp_reqs = NULL, *clov_reqs = NULL, *odd_proj_reqs = NULL; - - vector_PRECISION_init(&buffer_hopp); - vector_PRECISION_init(&buffer_clov); - vector_PRECISION_init(&buffer_odd_proj); #ifdef HAVE_TM1p1 - vector_PRECISION buffer_eps_term; - vector_PRECISION_init(&buffer_eps_term); + buffer_PRECISION buffer_eps_term = NULL; MPI_Request *eps_term_reqs = NULL; - MALLOC( buffer_eps_term.vector_buffer, complex_PRECISION, n*send_size_block ); + MALLOC( buffer_eps_term, complex_PRECISION, n*send_size_block ); MALLOC( eps_term_reqs, MPI_Request, n ); #endif #ifdef HAVE_TM - vector_PRECISION buffer_tm_term; - 
vector_PRECISION_init(&buffer_tm_term); + buffer_PRECISION buffer_tm_term = NULL; MPI_Request *tm_term_reqs = NULL; - MALLOC( buffer_tm_term.vector_buffer, complex_PRECISION, n*send_size_block ); + MALLOC( buffer_tm_term, complex_PRECISION, n*send_size_block ); MALLOC( tm_term_reqs, MPI_Request, n ); #endif - MALLOC( buffer_hopp.vector_buffer, complex_PRECISION, n*send_size_hopp ); - MALLOC( buffer_clov.vector_buffer, complex_PRECISION, n*send_size_clov ); - MALLOC( buffer_odd_proj.vector_buffer, complex_PRECISION, n*send_size_block ); + MALLOC( buffer_hopp, complex_PRECISION, n*send_size_hopp ); + MALLOC( buffer_clov, complex_PRECISION, n*send_size_clov ); + MALLOC( buffer_odd_proj, complex_PRECISION, n*send_size_block ); MALLOC( hopp_reqs, MPI_Request, n ); MALLOC( clov_reqs, MPI_Request, n ); MALLOC( odd_proj_reqs, MPI_Request, n ); @@ -301,39 +291,39 @@ void conf_PRECISION_gather( operator_PRECISION_struct *out, operator_PRECISION_s PROF_PRECISION_START( _GD_COMM ); for ( i=1; igs_PRECISION.gather_list[i], 4, g.comm_cart, &(eps_term_reqs[i]) ); #endif #ifdef HAVE_TM - MPI_Irecv( buffer_tm_term.vector_buffer+i*send_size_block, send_size_block, MPI_COMPLEX_PRECISION, + MPI_Irecv( buffer_tm_term+i*send_size_block, send_size_block, MPI_COMPLEX_PRECISION, l->gs_PRECISION.gather_list[i], 3, g.comm_cart, &(tm_term_reqs[i]) ); #endif - MPI_Irecv( buffer_hopp.vector_buffer+i*send_size_hopp, send_size_hopp, MPI_COMPLEX_PRECISION, + MPI_Irecv( buffer_hopp+i*send_size_hopp, send_size_hopp, MPI_COMPLEX_PRECISION, l->gs_PRECISION.gather_list[i], 0, g.comm_cart, &(hopp_reqs[i]) ); - MPI_Irecv( buffer_clov.vector_buffer+i*send_size_clov, send_size_clov, MPI_COMPLEX_PRECISION, + MPI_Irecv( buffer_clov+i*send_size_clov, send_size_clov, MPI_COMPLEX_PRECISION, l->gs_PRECISION.gather_list[i], 1, g.comm_cart, &(clov_reqs[i]) ); - MPI_Irecv( buffer_odd_proj.vector_buffer+i*send_size_block, send_size_block, MPI_COMPLEX_PRECISION, + MPI_Irecv( buffer_odd_proj+i*send_size_block, send_size_block, MPI_COMPLEX_PRECISION, l->gs_PRECISION.gather_list[i], 2, g.comm_cart, &(odd_proj_reqs[i]) ); } PROF_PRECISION_STOP( _GD_COMM, 2*n-2 ); #ifdef HAVE_TM1p1 for ( i=0; iepsbar_term[i]; + buffer_eps_term[i] = in->epsbar_term[i]; #endif #ifdef HAVE_TM for ( i=0; itm_term[i]; + buffer_tm_term[i] = in->tm_term[i]; #endif for ( i=0; iD[i]; + buffer_hopp[i] = in->D[i]; for ( i=0; iclover[i]; + buffer_clov[i] = in->clover[i]; for ( i=0; iodd_proj[i]; + buffer_odd_proj[i] = in->odd_proj[i]; #ifdef HAVE_TM1p1 PROF_PRECISION_START( _GD_IDLE ); @@ -344,7 +334,7 @@ void conf_PRECISION_gather( operator_PRECISION_struct *out, operator_PRECISION_s t = (send_size_block*n)/s; for ( i=0; iepsbar_term[ t*pi[i] + j ] = buffer_eps_term.vector_buffer[ t*i + j ]; + out->epsbar_term[ t*pi[i] + j ] = buffer_eps_term[ t*i + j ]; #endif #ifdef HAVE_TM PROF_PRECISION_START( _GD_IDLE ); @@ -355,7 +345,7 @@ void conf_PRECISION_gather( operator_PRECISION_struct *out, operator_PRECISION_s t = (send_size_block*n)/s; for ( i=0; itm_term[ t*pi[i] + j ] = buffer_tm_term.vector_buffer[ t*i + j ]; + out->tm_term[ t*pi[i] + j ] = buffer_tm_term[ t*i + j ]; #endif PROF_PRECISION_START( _GD_IDLE ); @@ -366,7 +356,7 @@ void conf_PRECISION_gather( operator_PRECISION_struct *out, operator_PRECISION_s t = (send_size_hopp*n)/s; for ( i=0; iD[ t*pi[i] + j ] = buffer_hopp.vector_buffer[ t*i + j ]; + out->D[ t*pi[i] + j ] = buffer_hopp[ t*i + j ]; PROF_PRECISION_START( _GD_IDLE ); for ( i=1; iclover[ t*pi[i] + j ] = buffer_clov.vector_buffer[ t*i + j ]; + out->clover[ 
t*pi[i] + j ] = buffer_clov[ t*i + j ]; PROF_PRECISION_START( _GD_IDLE ); for ( i=1; iodd_proj[ t*pi[i] + j ] = buffer_odd_proj.vector_buffer[ t*i + j ]; + out->odd_proj[ t*pi[i] + j ] = buffer_odd_proj[ t*i + j ]; - FREE( buffer_hopp.vector_buffer, complex_PRECISION, n*send_size_hopp ); - FREE( buffer_clov.vector_buffer, complex_PRECISION, n*send_size_clov ); - FREE( buffer_odd_proj.vector_buffer, complex_PRECISION, n*send_size_block ); + FREE( buffer_hopp, complex_PRECISION, n*send_size_hopp ); + FREE( buffer_clov, complex_PRECISION, n*send_size_clov ); + FREE( buffer_odd_proj, complex_PRECISION, n*send_size_block ); FREE( hopp_reqs, MPI_Request, n ); FREE( clov_reqs, MPI_Request, n ); FREE( odd_proj_reqs, MPI_Request, n ); #ifdef HAVE_TM - FREE( buffer_tm_term.vector_buffer, complex_PRECISION, n*send_size_block ); + FREE( buffer_tm_term, complex_PRECISION, n*send_size_block ); FREE( tm_term_reqs, MPI_Request, n ); #endif #ifdef HAVE_TM1p1 - FREE( buffer_eps_term.vector_buffer, complex_PRECISION, n*send_size_block ); + FREE( buffer_eps_term, complex_PRECISION, n*send_size_block ); FREE( eps_term_reqs, MPI_Request, n ); #endif diff --git a/src/ghost_generic.c b/src/ghost_generic.c index 9dee72c..b36d78f 100644 --- a/src/ghost_generic.c +++ b/src/ghost_generic.c @@ -89,8 +89,6 @@ void negative_wait_PRECISION( const int mu, comm_PRECISION_struct *c, level_stru MPI_Wait( &(c->rreqs[2*mu+1]), MPI_STATUS_IGNORE ); } } - - void ghost_alloc_PRECISION( int buffer_size, comm_PRECISION_struct *c, level_struct *l ) { int mu, nu, factor=1; @@ -124,8 +122,8 @@ void ghost_alloc_PRECISION( int buffer_size, comm_PRECISION_struct *c, level_str c->length[2*mu] = buffer_size; c->length[2*mu+1] = buffer_size; c->max_length[mu] = factor*buffer_size; - MALLOC( c->buffer[2*mu].vector_buffer, complex_PRECISION, factor*buffer_size ); - MALLOC( c->buffer[2*mu+1].vector_buffer, complex_PRECISION, factor*buffer_size ); + MALLOC( c->buffer[2*mu], complex_PRECISION, factor*buffer_size ); + MALLOC( c->buffer[2*mu+1], complex_PRECISION, factor*buffer_size ); c->in_use[2*mu] = 0; c->in_use[2*mu+1] = 0; } @@ -133,19 +131,20 @@ void ghost_alloc_PRECISION( int buffer_size, comm_PRECISION_struct *c, level_str for ( mu=0; mu<4; mu++ ) { c->max_length[mu] = buffer_size; #ifdef HAVE_TM1p1 - MALLOC( c->buffer[2*mu].vector_buffer, complex_PRECISION, 2*buffer_size ); - MALLOC( c->buffer[2*mu+1].vector_buffer, complex_PRECISION, 2*buffer_size ); + MALLOC( c->buffer[2*mu], complex_PRECISION, 2*buffer_size ); + MALLOC( c->buffer[2*mu+1], complex_PRECISION, 2*buffer_size ); #else - MALLOC( c->buffer[2*mu].vector_buffer, complex_PRECISION, buffer_size ); - MALLOC( c->buffer[2*mu+1].vector_buffer, complex_PRECISION, buffer_size ); + MALLOC( c->buffer[2*mu], complex_PRECISION, buffer_size ); + MALLOC( c->buffer[2*mu+1], complex_PRECISION, buffer_size ); #endif } } - - if ( l->vbuf_PRECISION[8].vector_buffer == NULL ) { + if ( l->vbuf_PRECISION[8].vector_buffer == NULL ) { #ifdef HAVE_TM1p1 + //vector_PRECISION_alloc( &(l->vbuf_PRECISION[8]), _ORDINARY, 2, l, no_threading); MALLOC( l->vbuf_PRECISION[8].vector_buffer, complex_PRECISION, 2*l->vector_size ); #else + //vector_PRECISION_alloc( l->vbuf_PRECISION[8], _ORDINARY, 1, l, no_threading); MALLOC( l->vbuf_PRECISION[8]->vector_buffer, complex_PRECISION, l->vector_size ); #endif } @@ -157,17 +156,11 @@ void ghost_free_PRECISION( comm_PRECISION_struct *c, level_struct *l ) { int mu; for ( mu=0; mu<4; mu++ ) { - FREE( c->buffer[2*mu].vector_buffer, complex_PRECISION, c->max_length[mu] ); - 
FREE( c->buffer[2*mu+1].vector_buffer, complex_PRECISION, c->max_length[mu] ); - } - - if ( l->vbuf_PRECISION[8].vector_buffer != NULL ) { -#ifdef HAVE_TM1p1 - FREE( l->vbuf_PRECISION[8].vector_buffer, complex_PRECISION, 2*l->vector_size ); -#else - FREE( l->vbuf_PRECISION[8].vector_buffer, complex_PRECISION, l->vector_size ); -#endif + FREE( c->buffer[2*mu], complex_PRECISION, c->max_length[mu] ); + FREE( c->buffer[2*mu+1], complex_PRECISION, c->max_length[mu] ); } + if ( l->vbuf_PRECISION[8].vector_buffer != NULL ) + vector_PRECISION_free( &(l->vbuf_PRECISION[8]), l, no_threading); } @@ -192,7 +185,7 @@ void ghost_sendrecv_PRECISION( buffer_PRECISION phi, const int mu, const int dir int i, j, *table=NULL, mu_dir = 2*mu-MIN(dir,0), offset = c->offset, length[2] = {0,0}, comm_start = 0, table_start = 0; - vector_PRECISION buffer, phi_pt; + buffer_PRECISION buffer, phi_pt; if ( amount == _FULL_SYSTEM ) { length[0] = (c->num_boundary_sites[2*mu])*offset; @@ -238,16 +231,16 @@ void ghost_sendrecv_PRECISION( buffer_PRECISION phi, const int mu, const int dir // afterwards (in ghost_wait) the data has to be distributed onto the correct sites // touching the respective boundary in -mu direction - phi_pt.vector_buffer = phi + comm_start; + phi_pt = phi + comm_start; if ( length[1] > 0 ) { PROF_PRECISION_START( _OP_COMM ); - MPI_Irecv( buffer.vector_buffer, length[1], MPI_COMPLEX_PRECISION, + MPI_Irecv( buffer, length[1], MPI_COMPLEX_PRECISION, l->neighbor_rank[2*mu+1], 2*mu, g.comm_cart, &(c->rreqs[2*mu]) ); PROF_PRECISION_STOP( _OP_COMM, 1 ); } if ( length[0] > 0 ) { PROF_PRECISION_START( _OP_COMM ); - MPI_Isend( phi_pt.vector_buffer, length[0], MPI_COMPLEX_PRECISION, + MPI_Isend( phi_pt, length[0], MPI_COMPLEX_PRECISION, l->neighbor_rank[2*mu], 2*mu, g.comm_cart, &(c->sreqs[2*mu]) ); PROF_PRECISION_STOP( _OP_COMM, 0 ); } @@ -261,25 +254,25 @@ void ghost_sendrecv_PRECISION( buffer_PRECISION phi, const int mu, const int dir table = c->boundary_table[2*mu+1]+table_start; for ( j=0; jbuffer[mu_dir]; - phi_pt.vector_buffer = phi + comm_start; + phi_pt = phi + comm_start; if ( length[0] > 0 ) { PROF_PRECISION_START( _OP_COMM ); - MPI_Irecv( phi_pt.vector_buffer, length[0], MPI_COMPLEX_PRECISION, + MPI_Irecv( phi_pt, length[0], MPI_COMPLEX_PRECISION, l->neighbor_rank[2*mu], 2*mu+1, g.comm_cart, &(c->rreqs[2*mu+1]) ); PROF_PRECISION_STOP( _OP_COMM, 1 ); } if ( length[1] > 0 ) { PROF_PRECISION_START( _OP_COMM ); - MPI_Isend( buffer.vector_buffer, length[1], MPI_COMPLEX_PRECISION, + MPI_Isend( buffer, length[1], MPI_COMPLEX_PRECISION, l->neighbor_rank[2*mu+1], 2*mu+1, g.comm_cart, &(c->sreqs[2*mu+1]) ); PROF_PRECISION_STOP( _OP_COMM, 0 ); } @@ -295,7 +288,7 @@ void ghost_wait_PRECISION( buffer_PRECISION phi, const int mu, const int dir, if( l->global_splitting[mu] > 1 ) { int mu_dir = 2*mu-MIN(dir,0); int i, j, *table, offset = c->offset, length[2]={0,0}, table_start = 0; - vector_PRECISION buffer, phi_pt; + buffer_PRECISION buffer, phi_pt; #ifdef HAVE_TM1p1 if ( g.n_flavours == 2 ) @@ -338,21 +331,21 @@ void ghost_wait_PRECISION( buffer_PRECISION phi, const int mu, const int dir, if ( l->depth == 0 ) { for ( j=0; jnum_lattice_site_var; length = c->num_boundary_sites[mu_dir]*l->num_lattice_site_var; @@ -399,28 +392,28 @@ void ghost_update_PRECISION( vector_PRECISION *phi, const int mu, const int dir, ASSERT( c->in_use[mu_dir] == 0 ); c->in_use[mu_dir] = 1; - recv_pt.vector_buffer = phi->vector_buffer + comm_start; + recv_pt = phi->vector_buffer + comm_start; if ( length > 0 ) { PROF_PRECISION_START( 
_OP_COMM ); - MPI_Irecv( recv_pt.vector_buffer, length, MPI_COMPLEX_PRECISION, + MPI_Irecv( recv_pt, length, MPI_COMPLEX_PRECISION, l->neighbor_rank[mu_dir], mu_dir, g.comm_cart, &(c->rreqs[mu_dir]) ); PROF_PRECISION_STOP( _OP_COMM, 1 ); } table = c->boundary_table[inv_mu_dir]; for ( j=0; jvector_buffer + table[j]*site_var; + phi_pt = phi->vector_buffer + table[j]*site_var; for ( i=0; ibuffer[mu_dir]; if ( length > 0 ) { PROF_PRECISION_START( _OP_COMM ); - MPI_Isend( buffer.vector_buffer, length, MPI_COMPLEX_PRECISION, + MPI_Isend( buffer, length, MPI_COMPLEX_PRECISION, l->neighbor_rank[inv_mu_dir], mu_dir, g.comm_cart, &(c->sreqs[mu_dir]) ); PROF_PRECISION_STOP( _OP_COMM, 0 ); } diff --git a/src/init_generic.c b/src/init_generic.c index b59a9f1..84a5814 100644 --- a/src/init_generic.c +++ b/src/init_generic.c @@ -94,23 +94,23 @@ double prof_PRECISION_print( level_struct *l ) { return flop; } - void fine_level_PRECISION_alloc( level_struct *l ) { int n = 8; - vector_PRECISION_init(&(l->vbuf_PRECISION[0])); #ifdef HAVE_TM1p1 - MALLOC( l->vbuf_PRECISION[0].vector_buffer, complex_PRECISION, 2*n*l->vector_size ); - for ( int i=1; ivbuf_PRECISION[i].vector_buffer = l->vbuf_PRECISION[0].vector_buffer + 2*i*l->vector_size; - MALLOC( l->p_PRECISION.b.vector_buffer, complex_PRECISION, 2*2*l->inner_vector_size ); - l->p_PRECISION.x.vector_buffer = l->p_PRECISION.b.vector_buffer + 2*l->inner_vector_size; + for ( int i=0; ivbuf_PRECISION[i]) ); + vector_PRECISION_alloc( &(l->vbuf_PRECISION[i]), _ORDINARY, 2, l, no_threading ); + } + vector_PRECISION_alloc( &(l->p_PRECISION.b), _INNER, 2, l, no_threading ); + vector_PRECISION_alloc( &(l->p_PRECISION.x), _INNER, 2, l, no_threading ); #else - MALLOC( l->vbuf_PRECISION[0].vector_buffer, complex_PRECISION, n*l->vector_size ); - for ( int i=1; ivbuf_PRECISION[i].vector_buffer = l->vbuf_PRECISION[0].vector_buffer + i*l->vector_size; - MALLOC( l->p_PRECISION.b.vector_buffer, complex_PRECISION, 2*l->inner_vector_size ); - l->p_PRECISION.x.vector_buffer = l->p_PRECISION.b.vector_buffer + l->inner_vector_size; + for ( int i=0; ivbuf_PRECISION[i]) ); + vector_PRECISION_alloc( &(l->vbuf_PRECISION[i]), _ORDINARY, 1, l, no_threading ); + } + vector_PRECISION_alloc( &(l->p_PRECISION.b), _INNER, 1, l, no_threading ); + vector_PRECISION_alloc( &(l->p_PRECISION.x), _INNER, 1, l, no_threading ); #endif } @@ -118,19 +118,10 @@ void fine_level_PRECISION_alloc( level_struct *l ) { void fine_level_PRECISION_free( level_struct *l ) { int n = 8; -#ifdef HAVE_TM1p1 - FREE( l->vbuf_PRECISION[0].vector_buffer, complex_PRECISION, 2*n*l->vector_size ); - for ( int i=1; ivbuf_PRECISION[i])); - FREE( l->p_PRECISION.b.vector_buffer, complex_PRECISION, 2*2*l->inner_vector_size ); - vector_PRECISION_init(&(l->p_PRECISION.x)); -#else - FREE( l->vbuf_PRECISION[0].vector_buffer, complex_PRECISION, n*l->vector_size ); - for ( int i=1; ivbuf_PRECISION[i])); - FREE( l->p_PRECISION.b.vector_buffer, complex_PRECISION, 2*l->inner_vector_size ); - vector_PRECISION_init(&(l->p_PRECISION.x)); -#endif + for ( int i=0; ivbuf_PRECISION[i]), l, no_threading ); + vector_PRECISION_free( &(l->p_PRECISION.b), l, no_threading ); + vector_PRECISION_free( &(l->p_PRECISION.x), l, no_threading ); } @@ -159,12 +150,13 @@ void next_level_PRECISION_setup( level_struct *l ) { &(l->next_level->p_PRECISION), l->next_level ); } else { vector_PRECISION_init(&(l->next_level->p_PRECISION.b)); + vector_PRECISION_init(&(l->next_level->p_PRECISION.x)); #ifdef HAVE_TM1p1 - MALLOC( l->next_level->p_PRECISION.b.vector_buffer, 
complex_PRECISION, 2*2*l->next_level->vector_size );
-      l->next_level->p_PRECISION.x.vector_buffer = l->next_level->p_PRECISION.b.vector_buffer + 2*l->next_level->vector_size;
+      vector_PRECISION_alloc( &(l->next_level->p_PRECISION.b), _ORDINARY, 2, l->next_level, no_threading );
+      vector_PRECISION_alloc( &(l->next_level->p_PRECISION.x), _ORDINARY, 2, l->next_level, no_threading );
 #else
-      MALLOC( l->next_level->p_PRECISION.b.vector_buffer, complex_PRECISION, 2*l->next_level->vector_size );
-      l->next_level->p_PRECISION.x.vector_buffer = l->next_level->p_PRECISION.b.vector_buffer + l->next_level->vector_size;
+      vector_PRECISION_alloc( &(l->next_level->p_PRECISION.b), _ORDINARY, 1, l->next_level, no_threading );
+      vector_PRECISION_alloc( &(l->next_level->p_PRECISION.x), _ORDINARY, 1, l->next_level, no_threading );
 #endif
       l->next_level->p_PRECISION.v_start = 0;
       l->next_level->p_PRECISION.v_end = l->next_level->inner_vector_size;
@@ -172,16 +164,14 @@ void next_level_PRECISION_setup( level_struct *l ) {
     
     int i, n = (l->next_level->level>0)?6:4;
-    vector_PRECISION_init(&(l->next_level->vbuf_PRECISION[0]));
+    for ( i=0; i<n; i++ ) {
+      vector_PRECISION_init( &(l->next_level->vbuf_PRECISION[i]) );
 #ifdef HAVE_TM1p1
-    MALLOC( l->next_level->vbuf_PRECISION[0].vector_buffer, complex_PRECISION, 2*n*l->next_level->vector_size );
-    for ( i=1; i<n; i++ ) l->next_level->vbuf_PRECISION[i].vector_buffer = l->next_level->vbuf_PRECISION[0].vector_buffer + 2*i*l->next_level->vector_size;
+      vector_PRECISION_alloc( &(l->next_level->vbuf_PRECISION[i]), _ORDINARY, 2, l->next_level, no_threading );
 #else
-    MALLOC( l->next_level->vbuf_PRECISION[0].vector_buffer, complex_PRECISION, n*l->next_level->vector_size );
-    for ( i=1; i<n; i++ ) l->next_level->vbuf_PRECISION[i].vector_buffer = l->next_level->vbuf_PRECISION[0].vector_buffer + i*l->next_level->vector_size;
+      vector_PRECISION_alloc( &(l->next_level->vbuf_PRECISION[i]), _ORDINARY, 1, l->next_level, no_threading );
 #endif
+    }
   }
 }
@@ -194,21 +184,13 @@ void next_level_PRECISION_free( level_struct *l ) {
   if ( ( l->level == 1 && !l->next_level->idle ) || g.kcycle ) {
     fgmres_PRECISION_struct_free( &(l->next_level->p_PRECISION), l->next_level );
   } else {
-#ifdef HAVE_TM1p1
-    FREE( l->next_level->p_PRECISION.b.vector_buffer, complex_PRECISION, 2*2*l->next_level->vector_size );
-#else
-    FREE( l->next_level->p_PRECISION.b.vector_buffer, complex_PRECISION, 2*l->next_level->vector_size );
-#endif
+    vector_PRECISION_free( &(l->next_level->p_PRECISION.b), l->next_level, no_threading );
+    vector_PRECISION_free( &(l->next_level->p_PRECISION.x), l->next_level, no_threading );
   }
   
   int i, n = (l->next_level->level>0)?6:4;
-  for ( i=1; i<n; i++ ) vector_PRECISION_init(&(l->next_level->vbuf_PRECISION[i]));
-#ifdef HAVE_TM1p1
-  FREE( l->next_level->vbuf_PRECISION[0].vector_buffer, complex_PRECISION, 2*n*l->next_level->vector_size );
-#else
-  FREE( l->next_level->vbuf_PRECISION[0].vector_buffer, complex_PRECISION, n*l->next_level->vector_size );
-#endif
+  for ( i=0; i<n; i++ ) vector_PRECISION_free( &(l->next_level->vbuf_PRECISION[i]), l->next_level, no_threading );
   
   coarsening_index_table_PRECISION_free( &(l->is_PRECISION), l );
 }
@@ -219,7 +201,7 @@ void level_PRECISION_init( level_struct *l ) {
   
   for ( int i=0; i<9; i++ )
-    vector_PRECISION_init(&(l->vbuf_PRECISION[i]));
+    vector_PRECISION_init( &(l->vbuf_PRECISION[i]) );
   
   operator_PRECISION_init( &(l->op_PRECISION) );
   operator_PRECISION_init( &(l->oe_op_PRECISION) );
@@ -238,8 +220,8 @@ void vcycle_timing_PRECISION( int n, level_struct *l, struct Thread *threading )
   vector_PRECISION_init(&v2);
   double t0=0, t1=0;
-  PUBLIC_MALLOC( v1.vector_buffer, complex_PRECISION, l->inner_vector_size );
-  PUBLIC_MALLOC( v2.vector_buffer, complex_PRECISION, l->inner_vector_size );
+  vector_PRECISION_alloc(&v1, _INNER, 1, l, threading);
+  vector_PRECISION_alloc(&v2, _INNER, 1, l, threading);
 
   START_LOCKED_MASTER(threading)
   vector_PRECISION_define_random( &v2, 0, l->inner_vector_size, l );
@@ -257,7 +239,7 @@ void vcycle_timing_PRECISION( int n, level_struct *l, struct Thread *threading )
   END_MASTER(threading)
 
   START_LOCKED_MASTER(threading)
-  PUBLIC_FREE( v1.vector_buffer, complex_PRECISION, l->inner_vector_size );
-  PUBLIC_FREE( v2.vector_buffer, complex_PRECISION, l->inner_vector_size );
+  vector_PRECISION_free(&v1, l, threading);
+  vector_PRECISION_free(&v2, l, threading);
   END_LOCKED_MASTER(threading)
 }
diff --git a/src/interpolation_generic.c b/src/interpolation_generic.c
index f6e853e..49770df 100644
--- a/src/interpolation_generic.c
+++ b/src/interpolation_generic.c
@@ -30,16 +30,13 @@ void interpolation_PRECISION_alloc( level_struct *l ) {
     MALLOC( l->is_PRECISION.eigenvalues, complex_PRECISION, n );
     MALLOC( l->is_PRECISION.test_vector, vector_PRECISION, n );
     MALLOC( l->is_PRECISION.interpolation, vector_PRECISION, n );
-    vector_PRECISION_init(&(l->is_PRECISION.interpolation[0]));
-    MALLOC_HUGEPAGES( l->is_PRECISION.interpolation[0].vector_buffer, complex_PRECISION, n*l->vector_size, 64 );
-    for ( k=1; k<n; k++ ) l->is_PRECISION.interpolation[k].vector_buffer = l->is_PRECISION.interpolation[0].vector_buffer + k*l->vector_size;
-    MALLOC( l->is_PRECISION.operator, complex_PRECISION, n*l->inner_vector_size );
-    vector_PRECISION_init(&(l->is_PRECISION.test_vector[0]));
-    MALLOC_HUGEPAGES( l->is_PRECISION.test_vector[0].vector_buffer, complex_PRECISION, n*l->inner_vector_size, 64 );
-    for ( k=1; k<n; k++ ) l->is_PRECISION.test_vector[k].vector_buffer = l->is_PRECISION.test_vector[0].vector_buffer + k*l->inner_vector_size;
+    for ( k=0; k<n; k++ ){
+      vector_PRECISION_init(&(l->is_PRECISION.interpolation[k]));
+      vector_PRECISION_alloc(&(l->is_PRECISION.interpolation[k]), _ORDINARY, 1, l, no_threading );
+      vector_PRECISION_init(&(l->is_PRECISION.test_vector[k]));
+      vector_PRECISION_alloc(&(l->is_PRECISION.test_vector[k]), _INNER, 1, l, no_threading );
+    }
+    MALLOC( l->is_PRECISION.operator, complex_PRECISION, n*l->inner_vector_size );
 }
@@ -61,10 +58,12 @@ void interpolation_PRECISION_free( level_struct *l ) {
   
   int n = l->num_eig_vect;
   
-  FREE_HUGEPAGES( l->is_PRECISION.test_vector[0].vector_buffer, complex_PRECISION, n*l->inner_vector_size );
+  for (int k=0; k<n; k++){
+    vector_PRECISION_free(&(l->is_PRECISION.interpolation[k]), l, no_threading );
+    vector_PRECISION_free(&(l->is_PRECISION.test_vector[k]), l, no_threading );
+  }
   FREE( l->is_PRECISION.eigenvalues, complex_PRECISION, n );
   FREE( l->is_PRECISION.test_vector, vector_PRECISION, n );
-  FREE_HUGEPAGES( l->is_PRECISION.interpolation[0].vector_buffer, complex_PRECISION, n*l->vector_size );
   FREE( l->is_PRECISION.interpolation, vector_PRECISION, n );
   FREE( l->is_PRECISION.operator, complex_PRECISION, n*l->inner_vector_size );
diff --git a/src/linalg_generic.c b/src/linalg_generic.c
index 41c515c..869cc01 100644
--- a/src/linalg_generic.c
+++ b/src/linalg_generic.c
@@ -363,16 +363,16 @@ void vector_PRECISION_projection( vector_PRECISION *z, vector_PRECISION *v, int
   vector_PRECISION v_tmp, *W_tmp = NULL;
   complex_PRECISION ip[k], ip_buffer[2*k];
   
-  vector_PRECISION_init(&v_tmp);
+  vector_PRECISION_init( &v_tmp );
   
-  MALLOC( v_tmp.vector_buffer, complex_PRECISION, l->inner_vector_size );
-  vector_PRECISION_define(&v_tmp, 0, 0, l->inner_vector_size, l );
+  vector_PRECISION_alloc(
&v_tmp, _INNER, 1, l, no_threading ); + vector_PRECISION_define( &v_tmp, 0, 0, l->inner_vector_size, l ); MALLOC( W_tmp, vector_PRECISION, k ); - vector_PRECISION_init(&W_tmp[0]); - MALLOC( W_tmp[0].vector_buffer, complex_PRECISION, k*l->inner_vector_size ); - for ( j = 1; jinner_vector_size; + for ( j = 0; jinner_vector_size, l ); @@ -394,8 +394,10 @@ void vector_PRECISION_projection( vector_PRECISION *z, vector_PRECISION *v, int else vector_PRECISION_copy( z, &v_tmp, 0, l->inner_vector_size, l ); - FREE( v_tmp.vector_buffer, complex_PRECISION, l->inner_vector_size ); - FREE( W_tmp[0].vector_buffer, complex_PRECISION, k*l->inner_vector_size ); + vector_PRECISION_free( &v_tmp, l, no_threading ); + for ( j = 0; jprnZ = op->prnT + j; op->prnY = op->prnZ + j; op->prnX = op->prnY + j; op->prpT = op->prnX + j; op->prpZ = op->prpT + j; op->prpY = op->prpZ + j; op->prpX = op->prpY + j; MALLOC( op->buffer, vector_PRECISION, 2 ); - vector_PRECISION_init(&(op->buffer[0])); + for(int i=0; i<2; i++ ){ + vector_PRECISION_init( &(op->buffer[i]) ); #ifdef HAVE_TM1p1 - MALLOC( op->buffer[0].vector_buffer, complex_PRECISION, 4*l->vector_size ); - op->buffer[1].vector_buffer = op->buffer[0].vector_buffer + 2*l->vector_size; + vector_PRECISION_alloc( &(op->buffer[i]), _ORDINARY, 2, l, no_threading ); #else - MALLOC( op->buffer[0].vector_buffer, complex_PRECISION, 2*l->vector_size ); - op->buffer[1].vector_buffer = op->buffer[0].vector_buffer + l->vector_size; + vector_PRECISION_alloc( &(op->buffer[i]), _ORDINARY, 1, l, no_threading ); #endif + } ghost_alloc_PRECISION( 0, &(op->c), l ); ghost_sendrecv_init_PRECISION( _COARSE_GLOBAL, &(op->c), l ) ; l->sp_PRECISION.v_end = op->num_even_sites*l->num_lattice_site_var; @@ -962,12 +962,14 @@ void oddeven_free_PRECISION( level_struct *l ) { FREE( l->oe_op_PRECISION.c.boundary_table[2*mu], int, bs ); l->oe_op_PRECISION.c.boundary_table[2*mu+1] = NULL; } - + + for(int i=0; i<2; i++ ){ #ifdef HAVE_TM1p1 - FREE( l->oe_op_PRECISION.buffer[0].vector_buffer, complex_PRECISION, 4*l->vector_size ); + vector_PRECISION_free( &(l->oe_op_PRECISION.buffer[i]), l, no_threading ); #else - FREE( l->oe_op_PRECISION.buffer[0].vector_buffer, complex_PRECISION, 2*l->vector_size ); + vector_PRECISION_free( &(l->oe_op_PRECISION.buffer[i]), l, no_threading ); #endif + } FREE( l->oe_op_PRECISION.buffer, vector_PRECISION, 2 ); #ifdef HAVE_TM1p1 FREE( l->oe_op_PRECISION.prnT, complex_PRECISION, 2*(l->num_lattice_site_var/2)*l->num_lattice_sites*8 ); @@ -2571,84 +2573,72 @@ void oddeven_PRECISION_test( level_struct *l ) { * - Compare solutions ( Difference should be close to 0 ). 
*********************************************************************************/ - vector_double d1, d2, d3; - vector_PRECISION f1, f2, f3, f4, f5; + vector_double d[3]; + vector_PRECISION f[5]; double diff; - vector_double_init(&d1); - vector_double_init(&d2); - vector_double_init(&d3); - - vector_PRECISION_init(&f1); - vector_PRECISION_init(&f2); - vector_PRECISION_init(&f3); - vector_PRECISION_init(&f4); - vector_PRECISION_init(&f5); - - MALLOC( d1.vector_buffer, complex_double, l->inner_vector_size ); - MALLOC( d2.vector_buffer, complex_double, l->inner_vector_size ); - MALLOC( d3.vector_buffer, complex_double, l->inner_vector_size ); - MALLOC( f1.vector_buffer, complex_PRECISION, l->inner_vector_size ); - MALLOC( f2.vector_buffer, complex_PRECISION, l->inner_vector_size ); - MALLOC( f3.vector_buffer, complex_PRECISION, l->inner_vector_size ); - MALLOC( f4.vector_buffer, complex_PRECISION, l->inner_vector_size ); - MALLOC( f5.vector_buffer, complex_PRECISION, l->inner_vector_size ); - - vector_double_define_random( &d1, 0, l->inner_vector_size, l ); - serial_to_oddeven_PRECISION( &f1, &d1, l, no_threading ); + for(int i=0; i<3; i++){ + vector_double_init( &d[i] ); + vector_double_alloc( &d[i], _INNER, 1, l, no_threading ); + } + + for(int i=0; i<5; i++){ + vector_PRECISION_init( &f[i] ); + vector_PRECISION_alloc( &f[i], _INNER, 1, l, no_threading ); + } + + vector_double_define_random( &d[0], 0, l->inner_vector_size, l ); + serial_to_oddeven_PRECISION( &f[0], &d[0], l, no_threading ); - diag_ee_PRECISION( &f2, &f1, &(l->oe_op_PRECISION), l, 0, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var ); - diag_oo_PRECISION( &f2, &f1, &(l->oe_op_PRECISION), l, no_threading ); + diag_ee_PRECISION( &f[1], &f[0], &(l->oe_op_PRECISION), l, 0, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var ); + diag_oo_PRECISION( &f[1], &f[0], &(l->oe_op_PRECISION), l, no_threading ); - hopping_term_PRECISION( &f2, &f1, &(l->oe_op_PRECISION), _FULL_SYSTEM, l, no_threading ); + hopping_term_PRECISION( &f[1], &f[0], &(l->oe_op_PRECISION), _FULL_SYSTEM, l, no_threading ); - d_plus_clover_double( &d2, &d1, &(g.op_double), l, no_threading ); - oddeven_to_serial_PRECISION( &d1, &f2, l, no_threading ); + d_plus_clover_double( &d[1], &d[0], &(g.op_double), l, no_threading ); + oddeven_to_serial_PRECISION( &d[0], &f[1], l, no_threading ); - vector_double_minus( &d3, &d1, &d2, 0, l->num_inner_lattice_sites, l ); - diff = global_norm_double( &d3, 0, l->num_inner_lattice_sites, l, no_threading )/global_norm_double( &d1, 0, l->num_inner_lattice_sites, l, no_threading ); + vector_double_minus( &d[2], &d[0], &d[1], 0, l->num_inner_lattice_sites, l ); + diff = global_norm_double( &d[2], 0, l->num_inner_lattice_sites, l, no_threading )/global_norm_double( &d[0], 0, l->num_inner_lattice_sites, l, no_threading ); test0_PRECISION("depth: %d, correctness of odd even layout: %le\n", l->depth, diff ); // -------------- - vector_PRECISION_copy( &f4, &f1, 0, l->inner_vector_size, l ); - diag_oo_PRECISION( &f3, &f4, &(l->oe_op_PRECISION), l, no_threading ); - diag_oo_inv_PRECISION( &f4, &f3, &(l->oe_op_PRECISION), l, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l->inner_vector_size ); - vector_PRECISION_minus( &f4, &f4, &f1, 0, l->inner_vector_size, l ); + vector_PRECISION_copy( &f[3], &f[0], 0, l->inner_vector_size, l ); + diag_oo_PRECISION( &f[2], &f[3], &(l->oe_op_PRECISION), l, no_threading ); + diag_oo_inv_PRECISION( &f[3], &f[2], &(l->oe_op_PRECISION), l, 
l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l->inner_vector_size ); + vector_PRECISION_minus( &f[3], &f[3], &f[0], 0, l->inner_vector_size, l ); - diff = (PRECISION) (global_norm_PRECISION( &f4, 0, l->inner_vector_size, l, no_threading )/global_norm_PRECISION( &f1, 0, l->inner_vector_size, l, no_threading )); + diff = (PRECISION) (global_norm_PRECISION( &f[3], 0, l->inner_vector_size, l, no_threading )/global_norm_PRECISION( &f[0], 0, l->inner_vector_size, l, no_threading )); test0_PRECISION("depth: %d, correctness of odd even diagonal term: %le\n", l->depth, diff ); // transformation part - vector_PRECISION_copy( &f4, &f1, 0, l->inner_vector_size, l ); + vector_PRECISION_copy( &f[3], &f[0], 0, l->inner_vector_size, l ); // even to odd // set odd part of f3 to 0. - vector_PRECISION_define( &f3, 0, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l->inner_vector_size, l ); + vector_PRECISION_define( &f[2], 0, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l->inner_vector_size, l ); - hopping_term_PRECISION( &f3, &f4, &(l->oe_op_PRECISION), _ODD_SITES, l, no_threading ); - diag_oo_inv_PRECISION( &f5, &f3, &(l->oe_op_PRECISION), l, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l->inner_vector_size ); - vector_PRECISION_plus( &f4, &f4, &f5, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l->inner_vector_size, l ); + hopping_term_PRECISION( &f[2], &f[3], &(l->oe_op_PRECISION), _ODD_SITES, l, no_threading ); + diag_oo_inv_PRECISION( &f[4], &f[2], &(l->oe_op_PRECISION), l, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l->inner_vector_size ); + vector_PRECISION_plus( &f[3], &f[3], &f[4], l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l->inner_vector_size, l ); // block diagonal part - apply_schur_complement_PRECISION( &f3, &f4, &(l->oe_op_PRECISION), l, no_threading ); - diag_oo_PRECISION( &f3, &f4, &(l->oe_op_PRECISION), l, no_threading ); + apply_schur_complement_PRECISION( &f[2], &f[3], &(l->oe_op_PRECISION), l, no_threading ); + diag_oo_PRECISION( &f[2], &f[3], &(l->oe_op_PRECISION), l, no_threading ); // back transformation part - diag_oo_inv_PRECISION( &f5, &f3, &(l->oe_op_PRECISION), l, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l->inner_vector_size ); - hopping_term_PRECISION( &f3, &f5, &(l->oe_op_PRECISION), _EVEN_SITES, l, no_threading ); + diag_oo_inv_PRECISION( &f[4], &f[3], &(l->oe_op_PRECISION), l, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l->inner_vector_size ); + hopping_term_PRECISION( &f[2], &f[4], &(l->oe_op_PRECISION), _EVEN_SITES, l, no_threading ); - vector_PRECISION_minus( &f1, &f2, &f3, 0, l->inner_vector_size, l ); - diff = (PRECISION) (global_norm_PRECISION( &f1, 0, l->inner_vector_size, l, no_threading )/global_norm_PRECISION( &f2, 0, l->inner_vector_size, l, no_threading )); + vector_PRECISION_minus( &f[0], &f[1], &f[2], 0, l->inner_vector_size, l ); + diff = (PRECISION) (global_norm_PRECISION( &f[0], 0, l->inner_vector_size, l, no_threading )/global_norm_PRECISION( &f[1], 0, l->inner_vector_size, l, no_threading )); test0_PRECISION("depth: %d, correctness of odd even schur complement: %le\n", l->depth, diff ); - - FREE( d1.vector_buffer, complex_double, l->inner_vector_size ); - FREE( d2.vector_buffer, complex_double, l->inner_vector_size ); - FREE( d3.vector_buffer, complex_double, l->inner_vector_size ); - FREE( f1.vector_buffer, complex_PRECISION, l->inner_vector_size ); - FREE( f2.vector_buffer, complex_PRECISION, l->inner_vector_size ); - FREE( 
f3.vector_buffer, complex_PRECISION, l->inner_vector_size ); - FREE( f4.vector_buffer, complex_PRECISION, l->inner_vector_size ); - FREE( f5.vector_buffer, complex_PRECISION, l->inner_vector_size ); + + for(int i=0; i<3; i++) + vector_double_free( &d[i], l, no_threading ); + + for(int i=0; i<5; i++) + vector_PRECISION_free( &f[i], l, no_threading ); } diff --git a/src/operator_generic.c b/src/operator_generic.c index 408c83f..28bb595 100644 --- a/src/operator_generic.c +++ b/src/operator_generic.c @@ -58,7 +58,7 @@ void operator_PRECISION_init( operator_PRECISION_struct *op ) { for ( int i=0; i<8; i++ ) { op->c.boundary_table[i] = NULL; - vector_PRECISION_init(&(op->c.buffer[i])); + op->c.buffer[i] = NULL; op->c.in_use[i] = 0; } op->c.comm = 1; @@ -393,45 +393,46 @@ void operator_PRECISION_test_routine( operator_PRECISION_struct *op, level_struc int ivs = l->inner_vector_size; double diff; - vector_double vd1, vd2, vd3, vd4; - vector_PRECISION vp1, vp2; + vector_double vd[4]; + vector_PRECISION vp[2]; - vector_double_init(&vd1); - vector_PRECISION_init(&vp1); - - PUBLIC_MALLOC( vd1.vector_buffer, complex_double, 4*ivs ); - PUBLIC_MALLOC( vp1.vector_buffer, complex_PRECISION, 2*ivs ); - - vd2.vector_buffer = vd1.vector_buffer + ivs; vd3.vector_buffer = vd2.vector_buffer + ivs; - vd4.vector_buffer = vd3.vector_buffer + ivs; vp2.vector_buffer = vp1.vector_buffer + ivs; + for(int i=0; i<4; i++){ + vector_double_init( &vd[i] ); + vector_double_alloc( &vd[i], _INNER, 1, l, threading ); + } + + for(int i=0; i<2; i++){ + vector_PRECISION_init( &vp[i] ); + vector_PRECISION_alloc( &vp[i], _INNER, 1, l, threading ); + } START_LOCKED_MASTER(threading) - vector_double_define_random( &vd1, 0, l->inner_vector_size, l ); - apply_operator_double( &vd2, &vd1, &(g.p), l, no_threading ); + vector_double_define_random( &vd[0], 0, l->inner_vector_size, l ); + apply_operator_double( &vd[1], &vd[0], &(g.p), l, no_threading ); - trans_PRECISION( &vp1, &vd1, op->translation_table, l, no_threading ); - apply_operator_PRECISION( &vp2, &vp1, &(l->p_PRECISION), l, no_threading ); - trans_back_PRECISION( &vd3, &vp2, op->translation_table, l, no_threading ); + trans_PRECISION( &vp[0], &vd[0], op->translation_table, l, no_threading ); + apply_operator_PRECISION( &vp[1], &vp[0], &(l->p_PRECISION), l, no_threading ); + trans_back_PRECISION( &vd[2], &vp[1], op->translation_table, l, no_threading ); - vector_double_minus( &vd4, &vd3, &vd2, 0, l->inner_vector_size, l ); - diff = global_norm_double( &vd4, 0, ivs, l, no_threading )/ - global_norm_double( &vd3, 0, ivs, l, no_threading ); + vector_double_minus( &vd[3], &vd[2], &vd[1], 0, l->inner_vector_size, l ); + diff = global_norm_double( &vd[3], 0, ivs, l, no_threading )/ + global_norm_double( &vd[2], 0, ivs, l, no_threading ); test0_PRECISION("depth: %d, correctness of schwarz PRECISION Dirac operator: %le\n", l->depth, diff ); END_LOCKED_MASTER(threading) if(threading->n_core > 1) { - apply_operator_PRECISION( &vp2, &vp1, &(l->p_PRECISION), l, threading ); + apply_operator_PRECISION( &vp[1], &vp[0], &(l->p_PRECISION), l, threading ); SYNC_MASTER_TO_ALL(threading) SYNC_CORES(threading) START_LOCKED_MASTER(threading) - trans_back_PRECISION( &vd3, &vp2, op->translation_table, l, no_threading ); - vector_double_minus( &vd4, &vd3, &vd2, 0, l->inner_vector_size, l ); - diff = global_norm_double( &vd4, 0, ivs, l, no_threading ) / - global_norm_double( &vd3, 0, ivs, l, no_threading ); + trans_back_PRECISION( &vd[2], &vp[1], op->translation_table, l, no_threading ); + 
vector_double_minus( &vd[3], &vd[2], &vd[1], 0, l->inner_vector_size, l ); + diff = global_norm_double( &vd[3], 0, ivs, l, no_threading ) / + global_norm_double( &vd[2], 0, ivs, l, no_threading ); if ( diff > EPS_PRECISION ) printf0("\x1b[31m"); @@ -442,9 +443,14 @@ void operator_PRECISION_test_routine( operator_PRECISION_struct *op, level_struc END_LOCKED_MASTER(threading) } - - PUBLIC_FREE( vd1.vector_buffer, complex_double, 4*ivs ); - PUBLIC_FREE( vp1.vector_buffer, complex_PRECISION, 2*ivs ); + + for(int i=0; i<4; i++){ + vector_double_free( &vd[i], l, threading ); + } + + for(int i=0; i<2; i++){ + vector_PRECISION_free( &vp[i], l, threading ); + } START_LOCKED_MASTER(threading) if ( g.method >=4 && g.odd_even ) diff --git a/src/schwarz_generic.c b/src/schwarz_generic.c index 27cf865..6cae49f 100644 --- a/src/schwarz_generic.c +++ b/src/schwarz_generic.c @@ -69,7 +69,6 @@ void schwarz_PRECISION_init( schwarz_PRECISION_struct *s, level_struct *l ) { s->num_colors = 0; } - void schwarz_PRECISION_alloc( schwarz_PRECISION_struct *s, level_struct *l ) { int i, j, n, mu, nu, *bl = l->block_lattice; @@ -141,18 +140,19 @@ void schwarz_PRECISION_alloc( schwarz_PRECISION_struct *s, level_struct *l ) { MALLOC( s->block, block_struct, s->num_blocks ); int svs = l->schwarz_vector_size, vs = (l->depth==0)?l->inner_vector_size:l->vector_size; + int nvec = 1; #ifdef HAVE_TM1p1 svs *= 2; vs *= 2; + nvec = 2; #endif if ( l->depth == 0 ) { - vector_PRECISION_init(&(s->oe_buf[0])); - MALLOC( s->oe_buf[0].vector_buffer, complex_PRECISION, 4*vs ); - s->oe_buf[1].vector_buffer = s->oe_buf[0].vector_buffer + vs; - s->oe_buf[2].vector_buffer = s->oe_buf[1].vector_buffer + vs; - s->oe_buf[3].vector_buffer = s->oe_buf[2].vector_buffer + vs; + for ( i=0; i<4; i++ ) { + vector_PRECISION_init( &(s->oe_buf[i]) ); + vector_PRECISION_alloc( &(s->oe_buf[i]), _INNER, nvec, l, no_threading ); + } } n = 0; @@ -173,19 +173,25 @@ void schwarz_PRECISION_alloc( schwarz_PRECISION_struct *s, level_struct *l ) { s->block[i].bt = NULL; MALLOC( s->block[i].bt, int, n ); } - vector_PRECISION_init(&(s->buf1)); - MALLOC( s->buf1.vector_buffer, complex_PRECISION, vs+3*svs ); - s->buf2.vector_buffer = s->buf1.vector_buffer + vs; - s->buf3.vector_buffer = s->buf2.vector_buffer + svs; - s->buf4.vector_buffer = s->buf3.vector_buffer + svs; + vector_PRECISION_init( &(s->buf1) ); + vector_PRECISION_init( &(s->buf2) ); + vector_PRECISION_init( &(s->buf3) ); + vector_PRECISION_init( &(s->buf4) ); + + vector_PRECISION_alloc( &(s->buf1), (l->depth==0)?_INNER:_ORDINARY, nvec, l, no_threading ); + vector_PRECISION_alloc( &(s->buf2), _SCHWARZ, nvec, l, no_threading ); + vector_PRECISION_alloc( &(s->buf3), _SCHWARZ, nvec, l, no_threading ); + vector_PRECISION_alloc( &(s->buf4), _SCHWARZ, nvec, l, no_threading ); if ( g.method == 1 ){ - vector_PRECISION_init(&(s->buf5)); - MALLOC( s->buf5.vector_buffer, complex_PRECISION, svs ); + vector_PRECISION_init( &(s->buf5) ); + vector_PRECISION_alloc( &(s->buf5), _SCHWARZ, nvec, l, no_threading ); + } + + for ( i=0; i<2; i++ ) { + vector_PRECISION_init( &(l->sbuf_PRECISION[i]) ); + vector_PRECISION_alloc( &(l->sbuf_PRECISION[i]), (l->depth==0)?_INNER:_ORDINARY, nvec, l, no_threading ); } - vector_PRECISION_init(&(l->sbuf_PRECISION[0])); - MALLOC( l->sbuf_PRECISION[0].vector_buffer, complex_PRECISION, 2*vs ); - l->sbuf_PRECISION[1].vector_buffer = l->sbuf_PRECISION[0].vector_buffer + vs; // these buffers are introduced to make local_minres_PRECISION thread-safe MALLOC( s->local_minres_buffer[0], 
complex_PRECISION, svs ); @@ -261,27 +267,22 @@ void schwarz_PRECISION_free( schwarz_PRECISION_struct *s, level_struct *l ) { svs *= 2; vs *= 2; #endif - if ( l->depth == 0 ) { - vector_PRECISION_init(&(s->oe_buf[1])); - vector_PRECISION_init(&(s->oe_buf[2])); - vector_PRECISION_init(&(s->oe_buf[3])); - FREE( s->oe_buf[0].vector_buffer, complex_PRECISION, 4*vs ); - vector_PRECISION_init(&(s->oe_buf[0])); - } + if ( l->depth == 0 ) + for ( i=0; i<4; i++ ) + vector_PRECISION_free( &(s->oe_buf[i]), l, no_threading ); - - FREE( s->buf1.vector_buffer, complex_PRECISION, vs+3*svs ); - vector_PRECISION_init(&(s->buf2)); - vector_PRECISION_init(&(s->buf3)); - vector_PRECISION_init(&(s->buf4)); + vector_PRECISION_free( &(s->buf1), l, no_threading ); + vector_PRECISION_free( &(s->buf2), l, no_threading ); + vector_PRECISION_free( &(s->buf3), l, no_threading ); + vector_PRECISION_free( &(s->buf4), l, no_threading ); if ( g.method == 1 ) - FREE( s->buf5.vector_buffer, complex_PRECISION, svs ); + vector_PRECISION_free( &(s->buf5), l, no_threading ); operator_PRECISION_free( &(s->op), _SCHWARZ, l ); - FREE( l->sbuf_PRECISION[0].vector_buffer, complex_PRECISION, 2*vs ); - vector_PRECISION_init(&(l->sbuf_PRECISION[1])); + for ( i=0; i<2; i++ ) + vector_PRECISION_free( &(l->sbuf_PRECISION[i]), l, no_threading ); FREE( s->local_minres_buffer[0], complex_PRECISION, svs ); FREE( s->local_minres_buffer[1], complex_PRECISION, svs ); @@ -2111,10 +2112,9 @@ void sixteen_color_schwarz_PRECISION( vector_PRECISION *phi, vector_PRECISION *D vector_PRECISION true_r; vector_PRECISION_init(&true_r); - PUBLIC_MALLOC( true_r.vector_buffer, complex_PRECISION, l->vector_size ); + vector_PRECISION_alloc( &true_r, _ORDINARY, 1, l, threading ); vector_PRECISION_define( &true_r, 0, 0, l->inner_vector_size, l ); - if ( D_phi == NULL ) { for ( mu=0; mu<4; mu++ ) { ghost_update_PRECISION( x, mu, +1, &(s->op.c), l ); @@ -2138,7 +2138,7 @@ void sixteen_color_schwarz_PRECISION( vector_PRECISION *phi, vector_PRECISION *D char number[3]; sprintf( number, "%2d", 31+l->depth ); printf0("\033[1;%2sm|", number ); printf0(" ---- depth: %d, c: %d, schwarz iter %2d, norm: %11.6le |", l->depth, s->num_colors, k, r_norm ); printf0("\033[0m\n"); fflush(0); - PUBLIC_FREE( true_r.vector_buffer, complex_PRECISION, l->vector_size ); + vector_PRECISION_free( &true_r, l, threading ); END_LOCKED_MASTER(threading) #endif } @@ -2222,9 +2222,7 @@ void schwarz_PRECISION_mvm_testfun( schwarz_PRECISION_struct *s, level_struct *l START_UNTHREADED_FUNCTION(threading) int mu, i, nb = s->num_blocks; - int svs = l->schwarz_vector_size; int ivs = l->inner_vector_size; - int vs = l->vector_size; void (*block_op)() = (l->depth==0)?block_d_plus_clover_PRECISION:coarse_block_operator_PRECISION; void (*op)() = (l->depth==0)?d_plus_clover_PRECISION:apply_coarse_operator_PRECISION; @@ -2233,13 +2231,13 @@ void schwarz_PRECISION_mvm_testfun( schwarz_PRECISION_struct *s, level_struct *l vector_PRECISION v1, v2, v3; PRECISION diff; - vector_PRECISION_init(&v1); - vector_PRECISION_init(&v2); - vector_PRECISION_init(&v3); + vector_PRECISION_init( &v1 ); + vector_PRECISION_init( &v2 ); + vector_PRECISION_init( &v3 ); - MALLOC( v1.vector_buffer, complex_PRECISION, svs ); - MALLOC( v2.vector_buffer, complex_PRECISION, vs ); - MALLOC( v3.vector_buffer, complex_PRECISION, vs ); + vector_PRECISION_alloc( &v1, _SCHWARZ, 1, l, no_threading ); + vector_PRECISION_alloc( &v2, _ORDINARY, 1, l, no_threading ); + vector_PRECISION_alloc( &v3, _ORDINARY, 1, l, no_threading ); 
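/* A minimal usage sketch of the alloc/free pairing these hunks converge on,
 * assuming the vector_PRECISION fields introduced earlier in this series:
 * the size class and the vector count are recorded in the vector at
 * allocation time, so the matching free needs no hand-computed length.
 *
 *   vector_PRECISION v;
 *   vector_PRECISION_init( &v );                   // vector_buffer = NULL
 *   vector_PRECISION_alloc( &v, _SCHWARZ, 1, l, no_threading );
 *   // ... use v ...
 *   vector_PRECISION_free( &v, l, no_threading );  // length from v.type and v.num_vect
 */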
vector_PRECISION_define_random( &v1, 0, ivs, l ); @@ -2265,10 +2263,10 @@ void schwarz_PRECISION_mvm_testfun( schwarz_PRECISION_struct *s, level_struct *l global_norm_PRECISION( &v2, 0, l->inner_vector_size, l, no_threading ); test0_PRECISION("depth: %d, correctness of local residual vector: %le\n", l->depth, diff ); - - FREE( v1.vector_buffer, complex_PRECISION, l->schwarz_vector_size ); - FREE( v2.vector_buffer, complex_PRECISION, l->vector_size ); - FREE( v3.vector_buffer, complex_PRECISION, l->vector_size ); + + vector_PRECISION_free( &v1, l, no_threading ); + vector_PRECISION_free( &v2, l, no_threading ); + vector_PRECISION_free( &v3, l, no_threading ); END_UNTHREADED_FUNCTION(threading) } diff --git a/src/setup_generic.c b/src/setup_generic.c index ef1e967..63ee4b5 100644 --- a/src/setup_generic.c +++ b/src/setup_generic.c @@ -145,7 +145,7 @@ void read_tv_from_file_PRECISION( level_struct *l, struct Thread *threading ) { vector_double tmp; vector_double_init(&tmp); - MALLOC( tmp.vector_buffer, complex_double, l->inner_vector_size ); + vector_double_alloc( &tmp, _INNER, 1, l, no_threading ); for ( i=0; iis_PRECISION.test_vector[i]), &tmp, l->s_PRECISION.op.translation_table, l, no_threading ); } - FREE( tmp.vector_buffer, complex_double, l->inner_vector_size ); + vector_double_free( &tmp, l, no_threading ); END_LOCKED_MASTER(threading) @@ -208,11 +208,12 @@ void interpolation_PRECISION_define( vector_double *V, level_struct *l, struct T START_MASTER(threading) vector_PRECISION_init(&buffer[0]); END_MASTER(threading) - PUBLIC_MALLOC( buffer[0].vector_buffer, complex_PRECISION, l->vector_size*3 ); START_MASTER(threading) - for( i=1; i<3; i++) - buffer[i].vector_buffer = buffer[0].vector_buffer + l->vector_size*i; + for( i=0; i<3; i++){ + vector_PRECISION_init( &buffer[i] ); + vector_PRECISION_alloc( &buffer[i], _ORDINARY, 1, l, threading ); + } if ( g.print > 0 ) printf0("initial definition --- depth: %d\n", l->depth ); #ifdef DEBUG if ( g.print > 0 ) { printf0("\033[0;42m\033[1;37m|"); fflush(0); } @@ -242,7 +243,9 @@ void interpolation_PRECISION_define( vector_double *V, level_struct *l, struct T #endif } - PUBLIC_FREE( buffer[0].vector_buffer, complex_PRECISION, l->vector_size*3 ); + for( i=0; i<3; i++){ + vector_PRECISION_free( &buffer[i], l, threading ); + } PUBLIC_FREE( buffer, vector_PRECISION, 3 ); for ( k=0; kvector_size ); + vector_PRECISION_init( &buf1 ); + vector_PRECISION_alloc( &buf1, _ORDINARY, 1, l, no_threading ); fgmres_PRECISION_struct_init( &gmres ); fgmres_PRECISION_struct_alloc( g.coarse_iter, g.coarse_restart, l->next_level->vector_size, g.coarse_tol, _COARSE_GMRES, _NOTHING, NULL, apply_coarse_operator_PRECISION, &gmres, l->next_level ); @@ -428,7 +431,7 @@ void inv_iter_2lvl_extension_setup_PRECISION( int setup_iter, level_struct *l, s inv_iter_2lvl_extension_setup_PRECISION( setup_iter, l->next_level, threading ); START_LOCKED_MASTER(threading) - FREE( buf1.vector_buffer, complex_PRECISION, l->vector_size ); + vector_PRECISION_free( &buf1, l, no_threading ); fgmres_PRECISION_struct_free( &gmres, l ); END_LOCKED_MASTER(threading) } @@ -471,8 +474,8 @@ void inv_iter_inv_fcycle_PRECISION( int setup_iter, level_struct *l, struct Thre set_kcycle_tol_PRECISION( g.coarse_tol, l ); END_LOCKED_MASTER(threading) SYNC_MASTER_TO_ALL(threading) - - PUBLIC_MALLOC( v_buf.vector_buffer, complex_PRECISION, l->vector_size ); + + vector_PRECISION_alloc( &v_buf, _ORDINARY, 1, l, threading ); if ( !l->idle ) { for ( int j=0; jsetup_iter))), l->next_level, threading ); } } - - 
PUBLIC_FREE( v_buf.vector_buffer, complex_PRECISION, l->vector_size ); + + vector_PRECISION_free( &v_buf, l, threading ); PUBLIC_FREE( buffer, complex_PRECISION, 2*l->num_eig_vect ); if ( l->depth == 0 ) { diff --git a/src/sse_interpolation_generic.c index 122eb10..876055f 100644 --- a/src/sse_interpolation_generic.c +++ b/src/sse_interpolation_generic.c @@ -29,22 +29,21 @@ void interpolation_PRECISION_alloc( level_struct *l ) { MALLOC( l->is_PRECISION.eigenvalues, complex_PRECISION, n ); MALLOC( l->is_PRECISION.test_vector, vector_PRECISION, n ); - + #ifndef INTERPOLATION_SETUP_LAYOUT_OPTIMIZED_PRECISION MALLOC( l->is_PRECISION.interpolation, vector_PRECISION, n ); - l->is_PRECISION.interpolation[0].vector_buffer = NULL; - MALLOC_HUGEPAGES( l->is_PRECISION.interpolation[0].vector_buffer, complex_PRECISION, n*l->vector_size, 128 ); - for ( k=1; k<n; k++ ) l->is_PRECISION.interpolation[k].vector_buffer = l->is_PRECISION.interpolation[0].vector_buffer + k*l->vector_size; + for ( k=0; k<n; k++ ) { + vector_PRECISION_init( &(l->is_PRECISION.interpolation[k]) ); + vector_PRECISION_alloc( &(l->is_PRECISION.interpolation[k]), _ORDINARY, 1, l, no_threading ); + } #endif // ghost shell is communicated in coarse_operator_setup, so we need size=vector_size, not inner_vector_size MALLOC_HUGEPAGES( l->is_PRECISION.operator, complex_PRECISION, ((size_t)OPERATOR_COMPONENT_OFFSET_PRECISION)*((size_t)l->vector_size), 128 ); - vector_PRECISION_init(&(l->is_PRECISION.test_vector[0])); - MALLOC_HUGEPAGES( l->is_PRECISION.test_vector[0].vector_buffer, complex_PRECISION, n*l->inner_vector_size, 128 ); - for ( k=1; k<n; k++ ) l->is_PRECISION.test_vector[k].vector_buffer = l->is_PRECISION.test_vector[0].vector_buffer + k*l->inner_vector_size; + for ( k=0; k<n; k++ ) { + vector_PRECISION_init( &(l->is_PRECISION.test_vector[k]) ); + vector_PRECISION_alloc( &(l->is_PRECISION.test_vector[k]), _INNER, 1, l, no_threading ); } } @@ -66,12 +65,16 @@ void interpolation_PRECISION_dummy_free( level_struct *l ) { void interpolation_PRECISION_free( level_struct *l ) { int n = l->num_eig_vect; - - FREE_HUGEPAGES( l->is_PRECISION.test_vector[0].vector_buffer, complex_PRECISION, n*l->inner_vector_size ); + + for (int k=0; k<n; k++){ + vector_PRECISION_free( &(l->is_PRECISION.test_vector[k]), l, no_threading ); } FREE( l->is_PRECISION.eigenvalues, complex_PRECISION, n ); FREE( l->is_PRECISION.test_vector, vector_PRECISION, n ); #ifndef INTERPOLATION_SETUP_LAYOUT_OPTIMIZED_PRECISION - FREE_HUGEPAGES( l->is_PRECISION.interpolation[0].vector_buffer, complex_PRECISION, n*l->vector_size ); + for (int k=0; k<n; k++){ + vector_PRECISION_free( &(l->is_PRECISION.interpolation[k]), l, no_threading ); + } FREE( l->is_PRECISION.interpolation, vector_PRECISION, n ); #endif FREE_HUGEPAGES( l->is_PRECISION.operator, complex_PRECISION, OPERATOR_COMPONENT_OFFSET_PRECISION*l->vector_size ); diff --git a/src/top_level.c index e0061cc..cd1d672 100644 --- a/src/top_level.c +++ b/src/top_level.c @@ -125,8 +125,8 @@ void solve_driver( level_struct *l, struct Thread *threading ) { vector_double solution, source; double minus_twisted_bc[4], norm; - vector_double_init(&solution); - vector_double_init(&source); + vector_double_init( &solution ); + vector_double_init( &source ); if(g.bc==2) for ( int i=0; i<4; i++ ) minus_twisted_bc[i] = -1*g.twisted_bc[i]; @@ -138,8 +138,8 @@ void solve_driver( level_struct *l, struct Thread *threading ) { printf0("inverting doublet operator\n"); } #endif - PUBLIC_MALLOC( solution.vector_buffer, complex_double, l->inner_vector_size ); - PUBLIC_MALLOC( source.vector_buffer, complex_double, l->inner_vector_size ); + vector_double_alloc( &solution, _INNER, 1, l, threading ); + vector_double_alloc(
&source, _INNER, 1, l, threading ); rhs_define( &source, l, threading ); @@ -185,8 +185,8 @@ void solve_driver( level_struct *l, struct Thread *threading ) { norm = global_norm_double( &solution, 0, l->inner_vector_size, l, threading ); printf0("solution vector norm: %le\n",norm); - PUBLIC_FREE( solution.vector_buffer, complex_double, l->inner_vector_size ); - PUBLIC_FREE( source.vector_buffer, complex_double, l->inner_vector_size ); + vector_double_free( &solution, l, threading ); + vector_double_free( &source, l, threading ); #ifdef HAVE_TM1p1 if( g.n_flavours == 2 ) diff --git a/src/var_table.h b/src/var_table.h index cb83615..dbc6b2a 100644 --- a/src/var_table.h +++ b/src/var_table.h @@ -41,7 +41,7 @@ tt0 = MPI_Wtime(); \ \ if ( g.vt.track_error ) { \ - MALLOC( v.vector_buffer, complex_double, l->inner_vector_size ); \ + vector_double_alloc( &v, _INNER, 1, l, no_threading ); \ if (g.mixed_precision==2) fgmres_MP( &(g.p_MP), l, no_threading ); \ else fgmres_double( &(g.p), l, no_threading ); \ vector_double_copy( &v, &x, 0, l->inner_vector_size, l ); \ @@ -94,7 +94,7 @@ } \ } \ if ( g.vt.track_error ) { \ - FREE( v.vector_buffer, complex_double, l->inner_vector_size ); \ + vector_double_free( &v, l, no_threading ); \ } \ tt1 = MPI_Wtime(); \ printf0("\n\ntotal time for parameter scan: %d minutes and %d seconds\n", \ diff --git a/src/vector_generic.c b/src/vector_generic.c index 081cc4a..47cfc76 100644 --- a/src/vector_generic.c +++ b/src/vector_generic.c @@ -26,16 +26,36 @@ void vector_PRECISION_init( vector_PRECISION *vec ) { vec->vector_buffer = NULL; } -/*void vector_PRECISION_alloc( vector_PRECISION *vec, const int type, int num_vect, level_struct *l ) { - - MALLOC( vec->vector_buffer, complex_PRECISION, num_vect ); + +void vector_PRECISION_alloc( vector_PRECISION *vec, const int type, int num_vect, level_struct *l, Thread *threading ) { + + switch (type){ + case _ORDINARY : PUBLIC_MALLOC( vec->vector_buffer, complex_PRECISION, l->vector_size*num_vect ); + break; + case _SCHWARZ : PUBLIC_MALLOC( vec->vector_buffer, complex_PRECISION, l->schwarz_vector_size*num_vect ); + break; + case _INNER: PUBLIC_MALLOC( vec->vector_buffer, complex_PRECISION, l->inner_vector_size*num_vect ); + break; + } + + vec->type = type; + vec->num_vect = num_vect; + vec->layout = _STANDARD; } -void vector_PRECISION_free( vector_PRECISION *vec, const int type, int num_vect, level_struct *l ) { + +void vector_PRECISION_free( vector_PRECISION *vec, level_struct *l, Thread *threading ) { - FREE( vec->vector_buffer, complex_PRECISION, num_vect ); + switch (vec->type){ + case _ORDINARY : PUBLIC_FREE( vec->vector_buffer, complex_PRECISION, l->vector_size*vec->num_vect ); + break; + case _SCHWARZ : PUBLIC_FREE( vec->vector_buffer, complex_PRECISION, l->schwarz_vector_size*vec->num_vect ); + break; + case _INNER: PUBLIC_FREE( vec->vector_buffer, complex_PRECISION, l->inner_vector_size*vec->num_vect ); + break; + } } -*/ + // vector storage for PRECISION precision void vector_PRECISION_define( vector_PRECISION *phi, complex_PRECISION value, int start, int end, level_struct *l ) { diff --git a/src/vector_generic.h b/src/vector_generic.h index 26a3970..ec13387 100644 --- a/src/vector_generic.h +++ b/src/vector_generic.h @@ -25,12 +25,12 @@ struct Thread; void vector_PRECISION_init( vector_PRECISION *vec ); - // void vector_PRECISION_alloc( vector_PRECISION *vec, const int type, int num_vect, level_struct *l ); - void vector_PRECISION_define( vector_PRECISION *phi, complex_PRECISION value, int start, int end, 
level_struct *l ); - void vector_PRECISION_real_scale( vector_PRECISION *z, vector_PRECISION *x, complex_PRECISION alpha, + void vector_PRECISION_alloc( vector_PRECISION *vec, const int type, int num_vect, level_struct *l, Thread *threading ); + void vector_PRECISION_free( vector_PRECISION *vec, level_struct *l, Thread *threading); + void vector_PRECISION_define( vector_PRECISION *phi, complex_PRECISION value, int start, int end, level_struct *l ); + void vector_PRECISION_real_scale( vector_PRECISION *z, vector_PRECISION *x, complex_PRECISION alpha, int start, int end, level_struct *l ); - void vector_PRECISION_copy( vector_PRECISION *z, vector_PRECISION *x, int start, int end, level_struct *l ); // z := x - // void vector_PRECISION_free( vector_PRECISION *vec, const int type, int num_vect, level_struct *l ); + void vector_PRECISION_copy( vector_PRECISION *z, vector_PRECISION *x, int start, int end, level_struct *l ); // z := x // void vector_PRECISION_test_routine( vector_PRECISION *vec, level_struct *l, struct Thread *threading ); From 4023112db1a844b931556914839d41dc2db9699e Mon Sep 17 00:00:00 2001 From: Marc Illa Date: Fri, 3 Aug 2018 09:18:34 +0300 Subject: [PATCH 12/31] Recover buffer free and remove print --- src/dirac.c | 6 +++++- src/ghost_generic.c | 2 -- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/dirac.c b/src/dirac.c index e04fe20..8d85319 100644 --- a/src/dirac.c +++ b/src/dirac.c @@ -528,7 +528,11 @@ void SU3_ghost_update( SU3_storage *U, level_struct *l ) { send_size = i; ASSERT(send_size<=max_size); } - + FREE( buffer1, complex_double, max_size ); + FREE( buffer2, complex_double, max_size ); + FREE( buffer3, complex_double, max_size ); + FREE( buffer4, complex_double, max_size ); + } diff --git a/src/ghost_generic.c b/src/ghost_generic.c index b36d78f..bf14b09 100644 --- a/src/ghost_generic.c +++ b/src/ghost_generic.c @@ -371,7 +371,6 @@ void ghost_wait_PRECISION( buffer_PRECISION phi, const int mu, const int dir, void ghost_update_PRECISION( vector_PRECISION *phi, const int mu, const int dir, comm_PRECISION_struct *c, level_struct *l ) { if( l->global_splitting[mu] > 1 ) { - printf0("hello"); int i, j, mu_dir = 2*mu-MIN(dir,0), nu, inv_mu_dir = 2*mu+1+MIN(dir,0), length, *table=NULL, comm_start, num_boundary_sites, site_var; buffer_PRECISION buffer, recv_pt, phi_pt; @@ -424,7 +423,6 @@ void ghost_update_PRECISION( vector_PRECISION *phi, const int mu, const int dir, void ghost_update_wait_PRECISION( vector_PRECISION *phi, const int mu, const int dir, comm_PRECISION_struct *c, level_struct *l ) { if( l->global_splitting[mu] > 1 ) { - printf0("hello"); int mu_dir = 2*mu-MIN(dir,0), length = c->num_boundary_sites[mu_dir]*l->num_lattice_site_var; ASSERT( c->in_use[mu_dir] == 1 ); From f637e7000d3cdd8abe9705b3cdad19155444090a Mon Sep 17 00:00:00 2001 From: Marc Illa Date: Fri, 3 Aug 2018 10:15:34 +0300 Subject: [PATCH 13/31] Change vector init/alloc and unused variables --- src/dirac_generic.c | 6 +++--- src/ghost_generic.c | 4 ++-- src/init_generic.c | 3 --- src/schwarz_generic.c | 6 ++---- 4 files changed, 7 insertions(+), 12 deletions(-) diff --git a/src/dirac_generic.c b/src/dirac_generic.c index bc603a8..5259374 100644 --- a/src/dirac_generic.c +++ b/src/dirac_generic.c @@ -1455,14 +1455,14 @@ void two_flavours_test_PRECISION( operator_PRECISION_struct *op, level_struct *l for(int i=0; i<4; i++){ vector_double_init( &vd[i] ); - vector_double_alloc( &vd[i], _INNER, 4, l, threading ); + vector_double_alloc( &vd[i], _INNER, 1, l, threading ); 
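/* Note on the corrected counts above: vector_double_alloc, as defined in
 * vector_generic.c earlier in this series, already multiplies the level's
 * size class by num_vect, so
 *   vector_double_alloc( &vdd[i], _INNER, 2, l, threading );
 * reserves l->inner_vector_size * 2 entries for the HAVE_TM1p1 flavour
 * doublet; the previous hand-multiplied 2*4 and 2*2 counts over-allocated. */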
vector_double_init( &vdd[i] ); - vector_double_alloc( &vdd[i], _INNER, 2*4, l, threading ); + vector_double_alloc( &vdd[i], _INNER, 2, l, threading ); } for(int i=0; i<2; i++){ vector_PRECISION_init( &vpp[i] ); - vector_PRECISION_alloc( &vpp[i], _INNER, 2*2, l, threading ); + vector_PRECISION_alloc( &vpp[i], _INNER, 2, l, threading ); } ASSERT(g.n_flavours==2); diff --git a/src/ghost_generic.c index bf14b09..422d1d1 100644 --- a/src/ghost_generic.c +++ b/src/ghost_generic.c @@ -144,8 +144,8 @@ void ghost_alloc_PRECISION( int buffer_size, comm_PRECISION_struct *c, level_str //vector_PRECISION_alloc( &(l->vbuf_PRECISION[8]), _ORDINARY, 2, l, no_threading); MALLOC( l->vbuf_PRECISION[8].vector_buffer, complex_PRECISION, 2*l->vector_size ); #else - //vector_PRECISION_alloc( l->vbuf_PRECISION[8], _ORDINARY, 1, l, no_threading); - MALLOC( l->vbuf_PRECISION[8]->vector_buffer, complex_PRECISION, l->vector_size ); + //vector_PRECISION_alloc( &(l->vbuf_PRECISION[8]), _ORDINARY, 1, l, no_threading); + MALLOC( l->vbuf_PRECISION[8].vector_buffer, complex_PRECISION, l->vector_size ); #endif } } diff --git a/src/init_generic.c index 84a5814..fc81c31 100644 --- a/src/init_generic.c +++ b/src/init_generic.c @@ -99,14 +99,12 @@ void fine_level_PRECISION_alloc( level_struct *l ) { int n = 8; #ifdef HAVE_TM1p1 for ( int i=0; i<n; i++ ) { - vector_PRECISION_init( &(l->vbuf_PRECISION[i]) ); vector_PRECISION_alloc( &(l->vbuf_PRECISION[i]), _ORDINARY, 2, l, no_threading ); } vector_PRECISION_alloc( &(l->p_PRECISION.b), _INNER, 2, l, no_threading ); vector_PRECISION_alloc( &(l->p_PRECISION.x), _INNER, 2, l, no_threading ); #else for ( int i=0; i<n; i++ ) { - vector_PRECISION_init( &(l->vbuf_PRECISION[i]) ); vector_PRECISION_alloc( &(l->vbuf_PRECISION[i]), _ORDINARY, 1, l, no_threading ); } vector_PRECISION_alloc( &(l->p_PRECISION.b), _INNER, 1, l, no_threading ); @@ -165,7 +163,6 @@ void next_level_PRECISION_setup( level_struct *l ) { int i, n = (l->next_level->level>0)?6:4; for ( i=0; i<n; i++ ) { - vector_PRECISION_init( &(l->next_level->vbuf_PRECISION[i]) ); #ifdef HAVE_TM1p1 vector_PRECISION_alloc( &(l->next_level->vbuf_PRECISION[i]), _ORDINARY, 2, l->next_level, no_threading ); #else diff --git a/src/schwarz_generic.c index 6cae49f..1cb5ffa 100644 --- a/src/schwarz_generic.c +++ b/src/schwarz_generic.c @@ -139,12 +139,11 @@ void schwarz_PRECISION_alloc( schwarz_PRECISION_struct *s, level_struct *l ) { MALLOC( s->block, block_struct, s->num_blocks ); - int svs = l->schwarz_vector_size, vs = (l->depth==0)?l->inner_vector_size:l->vector_size; + int svs = l->schwarz_vector_size; int nvec = 1; #ifdef HAVE_TM1p1 svs *= 2; - vs *= 2; nvec = 2; #endif @@ -261,11 +260,10 @@ void schwarz_PRECISION_free( schwarz_PRECISION_struct *s, level_struct *l ) { FREE( s->block, block_struct, s->num_blocks ); - int svs = l->schwarz_vector_size, vs = (l->depth==0)?l->inner_vector_size:l->vector_size; + int svs = l->schwarz_vector_size; #ifdef HAVE_TM1p1 svs *= 2; - vs *= 2; #endif if ( l->depth == 0 ) for ( i=0; i<4; i++ ) From e8ae7bdde1fdef337fe43259d0e36e2b8889c378 Mon Sep 17 00:00:00 2001 From: Marc Illa Date: Mon, 6 Aug 2018 11:32:11 +0300 Subject: [PATCH 14/31] Added change_layout function --- src/ghost_generic.c | 10 +++- src/main.h | 6 ++- src/main_pre_def_generic.h | 1 + src/solver_analysis.c | 5 ++ src/vector_generic.c | 107 +++++++++++++++++++++++++++++++++++-- src/vector_generic.h | 5 +- 6 files changed, 125 insertions(+), 9 deletions(-) diff --git a/src/ghost_generic.c index 422d1d1..bbc0062 100644 --- a/src/ghost_generic.c +++ b/src/ghost_generic.c @@
-159,8 +159,14 @@ void ghost_free_PRECISION( comm_PRECISION_struct *c, level_struct *l ) { FREE( c->buffer[2*mu], complex_PRECISION, c->max_length[mu] ); FREE( c->buffer[2*mu+1], complex_PRECISION, c->max_length[mu] ); } - if ( l->vbuf_PRECISION[8].vector_buffer != NULL ) - vector_PRECISION_free( &(l->vbuf_PRECISION[8]), l, no_threading); + if ( l->vbuf_PRECISION[8].vector_buffer != NULL ){ + // vector_PRECISION_free( &(l->vbuf_PRECISION[8]), l, no_threading); +#ifdef HAVE_TM1p1 + FREE( l->vbuf_PRECISION[8].vector_buffer, complex_PRECISION, 2*l->vector_size ); + #else + FREE( l->vbuf_PRECISION[8].vector_buffer, complex_PRECISION, l->vector_size ); + #endif + } } diff --git a/src/main.h b/src/main.h index 140100e..862564a 100644 --- a/src/main.h +++ b/src/main.h @@ -180,6 +180,9 @@ #else #define DEBUGOUTPUT( A, FORMAT ) #endif + + #define INDEX_NV_LV_SP_CL( NV, NUM_NV, LV, NUM_LV, SP, NUM_SP, CL, NUM_CL ) CL+NUM_CL*SP+NUM_CL*NUM_SP*LV+NUM_CL*NUM_SP*NUM_LV*NV + #define INDEX_LV_SP_CL_NV( NV, NUM_NV, LV, NUM_LV, SP, NUM_SP, CL, NUM_CL ) NV+NUM_NV*CL+NUM_NV*NUM_CL*SP+NUM_NV*NUM_CL*NUM_SP*LV #include "vectorization_control.h" #include "threading.h" @@ -204,7 +207,8 @@ _SM1, _SM2, _SM3, _SM4, _SMALL1, _SMALL2, _NUM_PROF }; // _NUM_PROF has always to be the last constant! enum { _VTS = 20 }; enum { _TRCKD_VAL, _STP_TIME, _SLV_ITER, _SLV_TIME, _CRS_ITER, _CRS_TIME, _SLV_ERR, _CGNR_ERR, _NUM_OPTB }; - + enum { _NV_LV_SP_CL_RI, _LV_SP_CL_RI_NV }; //vector layout + typedef struct block_struct { int start, color, no_comm, *bt; } block_struct; diff --git a/src/main_pre_def_generic.h b/src/main_pre_def_generic.h index e8a2b24..ce76aea 100644 --- a/src/main_pre_def_generic.h +++ b/src/main_pre_def_generic.h @@ -31,6 +31,7 @@ int num_vect; int layout; int type; + struct level_struct *l; } vector_PRECISION; typedef struct { diff --git a/src/solver_analysis.c b/src/solver_analysis.c index 325165e..5dd75d0 100644 --- a/src/solver_analysis.c +++ b/src/solver_analysis.c @@ -50,6 +50,11 @@ void test_routine( level_struct *l, struct Thread *threading ) { if ( g.method > 0 && g.method < 4 && g.odd_even ) block_oddeven_double_test( l, threading ); } + if ( g.mixed_precision ) + vector_float_test_routine( l, threading ); + else + vector_double_test_routine( l, threading ); + if ( g.interpolation && g.method > 0 ) { if ( g.mixed_precision ) coarse_operator_float_test_routine( l, threading ); diff --git a/src/vector_generic.c b/src/vector_generic.c index 47cfc76..56523af 100644 --- a/src/vector_generic.c +++ b/src/vector_generic.c @@ -40,7 +40,8 @@ void vector_PRECISION_alloc( vector_PRECISION *vec, const int type, int num_vect vec->type = type; vec->num_vect = num_vect; - vec->layout = _STANDARD; + vec->layout = _NV_LV_SP_CL_RI; + vec->l = l; } @@ -103,7 +104,105 @@ void vector_PRECISION_copy( vector_PRECISION *z, vector_PRECISION *x, int start, if(thread == 0 && start != end) PROF_PRECISION_STOP( _CPY, (double)(end-start)/(double)l->inner_vector_size ); } -/* -void vector_PRECISION_test_routine( vector_PRECISION *vec, level_struct *l, struct Thread *threading ) { -}*/ + +void vector_PRECISION_check_compatibility( vector_PRECISION *vec1, vector_PRECISION *vec2) { + + + +} + + +void vector_PRECISION_change_layout( vector_PRECISION *vec_out, vector_PRECISION *vec_in, const int layout, Thread *threading ) { + + if(vec_in->layout==layout) return; + + int n, i, s, c, lv = 0, num_s, num_c; + vector_PRECISION vec_tmp; + if( vec_in->vector_buffer == vec_out->vector_buffer ){ + vector_PRECISION_init( &vec_tmp ); + 
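/* In-place case: when vec_in and vec_out share a buffer, the permutation
 * cannot be applied destructively, so a scratch vector vec_tmp of the same
 * type and num_vect is allocated here, filled with the reordered data, and
 * copied back into vec_out (then freed) at the end of the routine; otherwise
 * vec_tmp simply aliases *vec_out and the reordering is written directly. */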
vector_PRECISION_alloc( &vec_tmp, vec_in->type, vec_in->num_vect, vec_in->l, no_threading ); + } else { + vec_tmp = *vec_out; + } + + if(vec_in->l->depth == 0){ + num_s = 4; + num_c = 3; + } else { + num_s = 2; + num_c = vec_in->l->num_parent_eig_vect; + } + + switch (vec_in->type){ + case _ORDINARY : + lv = vec_in->l->num_lattice_sites; + break; + case _SCHWARZ : + lv = 2*vec_in->l->num_lattice_sites - vec_in->l->num_inner_lattice_sites; + break; + case _INNER: + lv = vec_in->l->num_inner_lattice_sites; + break; + } + + switch (layout){ + case _NV_LV_SP_CL_RI : + for( n=0; n<vec_in->num_vect; n++ ) + for( i=0; i<lv; i++ ) + for( s=0; s<num_s; s++ ) + for( c=0; c<num_c; c++ ) + vec_tmp.vector_buffer[INDEX_NV_LV_SP_CL( n, vec_in->num_vect, i, lv, s, num_s, c, num_c )] = vec_in->vector_buffer[INDEX_LV_SP_CL_NV( n, vec_in->num_vect, i, lv, s, num_s, c, num_c )]; + + vec_out->layout = _NV_LV_SP_CL_RI; + break; + case _LV_SP_CL_RI_NV : + for( i=0; i<lv; i++ ) + for( s=0; s<num_s; s++ ) + for( c=0; c<num_c; c++ ) + for( n=0; n<vec_in->num_vect; n++ ) + vec_tmp.vector_buffer[INDEX_LV_SP_CL_NV( n, vec_in->num_vect, i, lv, s, num_s, c, num_c )] = vec_in->vector_buffer[INDEX_NV_LV_SP_CL( n, vec_in->num_vect, i, lv, s, num_s, c, num_c )]; + + vec_out->layout = _LV_SP_CL_RI_NV; + break; + } + + if( vec_in->vector_buffer == vec_out->vector_buffer ){ + vector_PRECISION_copy( vec_out, &vec_tmp, 0, lv*num_s*num_c*vec_out->num_vect, vec_out->l ); + vector_PRECISION_free( &vec_tmp, vec_in->l, no_threading ); + } + +} + +void vector_PRECISION_test_routine( level_struct *l, struct Thread *threading ) { + + PRECISION diff = 0; + + vector_PRECISION vp[3]; + + for(int i=0; i<3; i++){ + vector_PRECISION_init( &vp[i] ); + vector_PRECISION_alloc( &vp[i], _ORDINARY, 4, l, threading ); + } + + START_LOCKED_MASTER(threading) + + vector_PRECISION_define_random( &vp[0], 0, 4*l->vector_size, l ); + vector_PRECISION_copy( &vp[1], &vp[0], 0, 4*l->vector_size, l ); + vector_PRECISION_change_layout( &vp[1], &vp[1], _LV_SP_CL_RI_NV, no_threading ); + vector_PRECISION_change_layout( &vp[1], &vp[1], _NV_LV_SP_CL_RI, no_threading ); + vector_PRECISION_minus( &vp[2], &vp[1], &vp[0], 0, 4*l->vector_size, l ); + diff = global_norm_PRECISION( &vp[2], 0, 4*l->vector_size, l, no_threading )/ + global_norm_PRECISION( &vp[0], 0, 4*l->vector_size, l, no_threading ); + + test0_PRECISION("depth: %d, correctness of vector PRECISION change layout: %le\n", l->depth, diff ); + + END_LOCKED_MASTER(threading) + for(int i=0; i<3; i++){ + vector_PRECISION_free( &vp[i], l, threading ); + } + if ( l->level == 0 ) + return; + else + vector_PRECISION_test_routine(l->next_level, threading); +} diff --git a/src/vector_generic.h index ec13387..d2aeb5e 100644 --- a/src/vector_generic.h +++ b/src/vector_generic.h @@ -31,7 +31,8 @@ void vector_PRECISION_real_scale( vector_PRECISION *z, vector_PRECISION *x, complex_PRECISION alpha, int start, int end, level_struct *l ); void vector_PRECISION_copy( vector_PRECISION *z, vector_PRECISION *x, int start, int end, level_struct *l ); // z := x - - // void vector_PRECISION_test_routine( vector_PRECISION *vec, level_struct *l, struct Thread *threading ); + void vector_PRECISION_check_compatibility( vector_PRECISION *vec1, vector_PRECISION *vec2); + void vector_PRECISION_change_layout( vector_PRECISION *vec_out, vector_PRECISION *vec_in, const int layout, Thread *threading ); + void vector_PRECISION_test_routine( level_struct *l, struct Thread *threading ); #endif From 99f9edb71d16180982f50d255fed2175a6418791 Mon Sep 17 00:00:00 2001 From: Marc Illa Date: Mon, 6 Aug 2018 14:25:35 +0300 Subject: [PATCH 15/31] Added check compatibility function (does not work with TM1p1) --- src/init.c | 6 ++--
src/init_generic.c | 4 +-- src/linalg_generic.c | 16 +++++++++++ src/linsolve_generic.c | 31 +++++++++++++++------ src/linsolve_generic.h | 4 +-- src/main.h | 8 +++--- src/schwarz_generic.c | 6 ++-- src/setup_generic.c | 2 +- src/vector_generic.c | 62 +++++++++++++++++++++++------------------- src/vector_generic.h | 2 +- 10 files changed, 89 insertions(+), 52 deletions(-) diff --git a/src/init.c b/src/init.c index e67e3da..bb36883 100644 --- a/src/init.c +++ b/src/init.c @@ -167,7 +167,7 @@ void method_setup( vector_double *V, level_struct *l, struct Thread *threading ) #ifdef INIT_ONE_PREC } else { #endif - fgmres_double_struct_alloc( g.restart, g.max_restart, l->inner_vector_size, g.tol, + fgmres_double_struct_alloc( g.restart, g.max_restart, l->inner_vector_size, _INNER, g.tol, _GLOBAL_FGMRES, _RIGHT, preconditioner, g.method==6?g5D_plus_clover_double:d_plus_clover_double, &(g.p), l ); } @@ -193,14 +193,14 @@ void method_setup( vector_double *V, level_struct *l, struct Thread *threading ) #ifdef INIT_ONE_PREC } else { #endif - fgmres_double_struct_alloc( g.restart, g.max_restart, l->inner_vector_size, g.tol, + fgmres_double_struct_alloc( g.restart, g.max_restart, l->inner_vector_size, _INNER, g.tol, _GLOBAL_FGMRES, _NOTHING, NULL, d_plus_clover_double, &(g.p), l ); #ifdef INIT_ONE_PREC } #endif } else if ( g.method == -1 ) { - fgmres_double_struct_alloc( 4, g.restart*g.max_restart, l->inner_vector_size, g.tol, + fgmres_double_struct_alloc( 4, g.restart*g.max_restart, l->inner_vector_size, _INNER, g.tol, _GLOBAL_FGMRES, _NOTHING, NULL, d_plus_clover_double, &(g.p), l ); fine_level_double_alloc( l ); } diff --git a/src/init_generic.c b/src/init_generic.c index fc81c31..170f558 100644 --- a/src/init_generic.c +++ b/src/init_generic.c @@ -135,14 +135,14 @@ void next_level_PRECISION_setup( level_struct *l ) { coarsening_index_table_PRECISION_define( &(l->is_PRECISION), &(l->s_PRECISION), l ); if ( l->level == 1 && !l->next_level->idle ) { - fgmres_PRECISION_struct_alloc( g.coarse_iter, g.coarse_restart, l->next_level->vector_size, g.coarse_tol, + fgmres_PRECISION_struct_alloc( g.coarse_iter, g.coarse_restart, l->next_level->vector_size, _ORDINARY, g.coarse_tol, _COARSE_GMRES, _NOTHING, NULL, g.method==6?(g.odd_even?g5D_coarse_apply_schur_complement_PRECISION:g5D_apply_coarse_operator_PRECISION) :(g.odd_even?coarse_apply_schur_complement_PRECISION:apply_coarse_operator_PRECISION), &(l->next_level->p_PRECISION), l->next_level ); } else { if ( g.kcycle ) { - fgmres_PRECISION_struct_alloc( g.kcycle_restart, g.kcycle_max_restart, l->next_level->vector_size, g.kcycle_tol, + fgmres_PRECISION_struct_alloc( g.kcycle_restart, g.kcycle_max_restart, l->next_level->vector_size, _ORDINARY, g.kcycle_tol, _K_CYCLE, _RIGHT, vcycle_PRECISION, g.method==6?g5D_apply_coarse_operator_PRECISION:apply_coarse_operator_PRECISION, &(l->next_level->p_PRECISION), l->next_level ); diff --git a/src/linalg_generic.c b/src/linalg_generic.c index 869cc01..1076820 100644 --- a/src/linalg_generic.c +++ b/src/linalg_generic.c @@ -28,6 +28,8 @@ #ifndef OPTIMIZED_LINALG_PRECISION complex_PRECISION global_inner_product_PRECISION( vector_PRECISION *phi, vector_PRECISION *psi, int start, int end, level_struct *l, struct Thread *threading ) { + //vector_PRECISION_check_comp( phi, psi ); + PROF_PRECISION_START( _GIP, threading ); complex_PRECISION local_alpha = 0, global_alpha = 0; @@ -76,6 +78,8 @@ complex_PRECISION global_inner_product_PRECISION( vector_PRECISION *phi, vector_ complex_PRECISION process_inner_product_PRECISION( 
vector_PRECISION *phi, vector_PRECISION *psi, int start, int end, level_struct *l, struct Thread *threading ) { + //vector_PRECISION_check_comp( phi, psi ); + PROF_PRECISION_START( _PIP, threading ); int i; complex_PRECISION local_alpha = 0; @@ -107,6 +111,8 @@ complex_PRECISION process_inner_product_PRECISION( vector_PRECISION *phi, vector void process_multi_inner_product_PRECISION( int count, complex_PRECISION *results, vector_PRECISION *phi, vector_PRECISION *psi, int start, int end, level_struct *l, struct Thread *threading ) { + //vector_PRECISION_check_comp( phi, psi ); + PROF_PRECISION_START( _PIP, threading ); int i; for(int c=0; cvector_buffer[i])*psi->vector_buffer[i]; denominator += NORM_SQUARE_PRECISION(phi->vector_buffer[i]), i++, l ); @@ -248,6 +256,8 @@ PRECISION process_norm_PRECISION( vector_PRECISION *x, int start, int end, level void vector_PRECISION_plus( vector_PRECISION *z, vector_PRECISION *x, vector_PRECISION *y, int start, int end, level_struct *l ) { + //vector_PRECISION_check_comp( x, y ); + int thread = omp_get_thread_num(); if(thread == 0 && start != end) PROF_PRECISION_START( _LA2 ); @@ -261,6 +271,8 @@ void vector_PRECISION_plus( vector_PRECISION *z, vector_PRECISION *x, vector_PRE void vector_PRECISION_minus( vector_PRECISION *z, vector_PRECISION *x, vector_PRECISION *y, int start, int end, level_struct *l ) { + //vector_PRECISION_check_comp( x, y ); + int thread = omp_get_thread_num(); if(thread == 0 && start != end) PROF_PRECISION_START( _LA2 ); @@ -273,6 +285,8 @@ void vector_PRECISION_minus( vector_PRECISION *z, vector_PRECISION *x, vector_PR #ifndef OPTIMIZED_LINALG_PRECISION void vector_PRECISION_scale( vector_PRECISION *z, vector_PRECISION *x, complex_PRECISION alpha, int start, int end, level_struct *l ) { + + //vector_PRECISION_check_comp( z, x ); int thread = omp_get_thread_num(); if(thread == 0 && start != end) @@ -317,6 +331,8 @@ void buffer_PRECISION_copy( complex_PRECISION *z, complex_PRECISION *x, int star #ifndef OPTIMIZED_LINALG_PRECISION void vector_PRECISION_saxpy( vector_PRECISION *z, vector_PRECISION *x, vector_PRECISION *y, complex_PRECISION alpha, int start, int end, level_struct *l ) { + + //vector_PRECISION_check_comp( x, y ); int thread = omp_get_thread_num(); if (thread == 0 && start != end ) diff --git a/src/linsolve_generic.c b/src/linsolve_generic.c index 0717d4b..45eb1e1 100644 --- a/src/linsolve_generic.c +++ b/src/linsolve_generic.c @@ -44,7 +44,7 @@ void fgmres_PRECISION_struct_init( gmres_PRECISION_struct *p ) { } -void fgmres_PRECISION_struct_alloc( int m, int n, long int vl, PRECISION tol, const int type, const int prec_kind, +void fgmres_PRECISION_struct_alloc( int m, int n, long int vl, const int vl_type, PRECISION tol, const int type, const int prec_kind, void (*precond)(), void (*eval_op)(), gmres_PRECISION_struct *p, level_struct *l ) { /********************************************************************************* @@ -62,7 +62,7 @@ void fgmres_PRECISION_struct_alloc( int m, int n, long int vl, PRECISION tol, co *********************************************************************************/ long int total=0; - int i, k=0; + int i, k=0, n_vl=1; p->restart_length = m; p->num_restart = n; @@ -73,6 +73,7 @@ void fgmres_PRECISION_struct_alloc( int m, int n, long int vl, PRECISION tol, co #ifdef HAVE_TM1p1 vl*=2; + n_vl=2; #endif if(m > 0) { @@ -126,22 +127,36 @@ void fgmres_PRECISION_struct_alloc( int m, int n, long int vl, PRECISION tol, co // s p->s = p->H[0] + total; total += m+1; // w - p->w.vector_buffer = 
p->H[0] + total; total += vl; + vector_PRECISION_alloc( &(p->w), vl_type, n_vl, l, no_threading ); + total += vl; + //p->w.vector_buffer = p->H[0] + total; total += vl; // V for ( i=0; i<m+1; i++ ) { - p->V[i].vector_buffer = p->H[0] + total; total += vl; + vector_PRECISION_init(&(p->V[i])); + vector_PRECISION_alloc( &(p->V[i]), vl_type, n_vl, l, no_threading ); + total += vl; + //p->V[i].vector_buffer = p->H[0] + total; total += vl; } // Z for ( i=0; i<k; i++ ) { - p->Z[i].vector_buffer = p->H[0] + total; total += vl; + vector_PRECISION_init(&(p->Z[i])); + vector_PRECISION_alloc( &(p->Z[i]), vl_type, n_vl, l, no_threading ); + total += vl; + //p->Z[i].vector_buffer = p->H[0] + total; total += vl; } // x - p->x.vector_buffer = p->H[0] + total; total += vl; + vector_PRECISION_alloc( &(p->x), vl_type, n_vl, l, no_threading ); + total += vl; + //p->x.vector_buffer = p->H[0] + total; total += vl; // r - p->r.vector_buffer = p->H[0] + total; total += vl; + vector_PRECISION_alloc( &(p->r), vl_type, n_vl, l, no_threading ); + total += vl; + //p->r.vector_buffer = p->H[0] + total; total += vl; // b - p->b.vector_buffer = p->H[0] + total; total += vl; + vector_PRECISION_alloc( &(p->b), vl_type, n_vl, l, no_threading ); + total += vl; + //p->b.vector_buffer = p->H[0] + total; total += vl; ASSERT( p->total_storage == total ); } diff --git a/src/linsolve_generic.h index 1acde04..0000d7f 100644 --- a/src/linsolve_generic.h +++ b/src/linsolve_generic.h @@ -25,8 +25,8 @@ struct Thread; void fgmres_PRECISION_struct_init( gmres_PRECISION_struct *p ); - void fgmres_PRECISION_struct_alloc( int m, int n, long int vl, PRECISION tol, const int type, const int prec_kind, - void (*precond)(), void (*eval_op)(), gmres_PRECISION_struct *p, level_struct* l ); + void fgmres_PRECISION_struct_alloc( int m, int n, long int vl, const int vl_type, PRECISION tol, const int type, const int prec_kind, + void (*precond)(), void (*eval_op)(), gmres_PRECISION_struct *p, level_struct *l ); void fgmres_PRECISION_struct_free( gmres_PRECISION_struct *p, level_struct *l ); int fgmres_PRECISION( gmres_PRECISION_struct *p, level_struct *l, struct Thread *threading ); diff --git a/src/main.h index 862564a..689a7f3 100644 --- a/src/main.h +++ b/src/main.h @@ -39,7 +39,7 @@ #define EPS_double 1E-14 #define HAVE_TM // flag for enable twisted mass - #define HAVE_TM1p1 // flag for enable doublet for twisted mass + //#define HAVE_TM1p1 // flag for enable doublet for twisted mass #undef INIT_ONE_PREC // flag undef for enabling additional features in the lib @@ -181,8 +181,8 @@ #define DEBUGOUTPUT( A, FORMAT ) #endif - #define INDEX_NV_LV_SP_CL( NV, NUM_NV, LV, NUM_LV, SP, NUM_SP, CL, NUM_CL ) CL+NUM_CL*SP+NUM_CL*NUM_SP*LV+NUM_CL*NUM_SP*NUM_LV*NV - #define INDEX_LV_SP_CL_NV( NV, NUM_NV, LV, NUM_LV, SP, NUM_SP, CL, NUM_CL ) NV+NUM_NV*CL+NUM_NV*NUM_CL*SP+NUM_NV*NUM_CL*NUM_SP*LV + #define INDEX_NV_LV_SV( NV, NUM_NV, LV, NUM_LV, SV, NUM_SV ) SV+NUM_SV*LV+NUM_SV*NUM_LV*NV + #define INDEX_LV_SV_NV( NV, NUM_NV, LV, NUM_LV, SV, NUM_SV ) NV+NUM_NV*SV+NUM_NV*NUM_SV*LV #include "vectorization_control.h" #include "threading.h" @@ -204,7 +207,8 @@ _SM1, _SM2, _SM3, _SM4, _SMALL1, _SMALL2, _NUM_PROF }; // _NUM_PROF has always to be the last constant!
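The two index macros above pin down the storage orders named by the layout enum below: _NV_LV_SV keeps each of the NUM_NV vectors contiguous, while _LV_SV_NV interleaves them so that the NUM_NV values of one site variable sit next to each other, the natural order for operating on several right-hand sides at once. A worked instance of the arithmetic (variable names here are illustrative, not taken from the patch):

    /* vector-major: idx = sv + num_sv*( lv + num_lv*nv ) */
    int idx_a = INDEX_NV_LV_SV( nv, num_nv, lv, num_lv, sv, num_sv );
    /* vector-interleaved: idx = nv + num_nv*( sv + num_sv*lv ) */
    int idx_b = INDEX_LV_SV_NV( nv, num_nv, lv, num_lv, sv, num_sv );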
enum { _VTS = 20 }; enum { _TRCKD_VAL, _STP_TIME, _SLV_ITER, _SLV_TIME, _CRS_ITER, _CRS_TIME, _SLV_ERR, _CGNR_ERR, _NUM_OPTB }; - enum { _NV_LV_SP_CL_RI, _LV_SP_CL_RI_NV }; //vector layout + enum { _NV_LV_SV, _LV_SV_NV }; //vector layout typedef struct block_struct { int start, color, no_comm, *bt; diff --git a/src/schwarz_generic.c b/src/schwarz_generic.c index 1cb5ffa..fbf65cd 100644 --- a/src/schwarz_generic.c +++ b/src/schwarz_generic.c @@ -74,19 +74,19 @@ void schwarz_PRECISION_alloc( schwarz_PRECISION_struct *s, level_struct *l ) { int i, j, n, mu, nu, *bl = l->block_lattice; if ( g.method == 4 ) { - fgmres_PRECISION_struct_alloc( l->block_iter, 1, (l->depth==0)?l->inner_vector_size:l->vector_size, + fgmres_PRECISION_struct_alloc( l->block_iter, 1, (l->depth==0)?l->inner_vector_size:l->vector_size, (l->depth==0)?_INNER:_ORDINARY, EPS_PRECISION, _COARSE_GMRES, _NOTHING, NULL, (l->depth==0)?(g.odd_even?apply_schur_complement_PRECISION:d_plus_clover_PRECISION): (g.odd_even?coarse_apply_schur_complement_PRECISION:apply_coarse_operator_PRECISION), &(l->sp_PRECISION), l ); } else if ( g.method == 5 ) { - fgmres_PRECISION_struct_alloc( 5, 1, (l->depth==0)?l->inner_vector_size:l->vector_size, + fgmres_PRECISION_struct_alloc( 5, 1, (l->depth==0)?l->inner_vector_size:l->vector_size, (l->depth==0)?_INNER:_ORDINARY, EPS_PRECISION, _COARSE_GMRES, _NOTHING, NULL, (l->depth==0)?(g.odd_even?apply_schur_complement_PRECISION:d_plus_clover_PRECISION): (g.odd_even?coarse_apply_schur_complement_PRECISION:apply_coarse_operator_PRECISION), &(l->sp_PRECISION), l ); } else if ( g.method == 6 ) { - fgmres_PRECISION_struct_alloc( l->block_iter, 1, (l->depth==0)?l->inner_vector_size:l->vector_size, + fgmres_PRECISION_struct_alloc( l->block_iter, 1, (l->depth==0)?l->inner_vector_size:l->vector_size, (l->depth==0)?_INNER:_ORDINARY, EPS_PRECISION, _COARSE_GMRES, _NOTHING, NULL, (l->depth==0)?(g.odd_even?g5D_apply_schur_complement_PRECISION:g5D_plus_clover_PRECISION): (g.odd_even?g5D_coarse_apply_schur_complement_PRECISION:g5D_apply_coarse_operator_PRECISION), diff --git a/src/setup_generic.c b/src/setup_generic.c index 63ee4b5..4fe4d70 100644 --- a/src/setup_generic.c +++ b/src/setup_generic.c @@ -345,7 +345,7 @@ void inv_iter_2lvl_extension_setup_PRECISION( int setup_iter, level_struct *l, s vector_PRECISION_init( &buf1 ); vector_PRECISION_alloc( &buf1, _ORDINARY, 1, l, no_threading ); fgmres_PRECISION_struct_init( &gmres ); - fgmres_PRECISION_struct_alloc( g.coarse_iter, g.coarse_restart, l->next_level->vector_size, g.coarse_tol, + fgmres_PRECISION_struct_alloc( g.coarse_iter, g.coarse_restart, l->next_level->vector_size, _ORDINARY, g.coarse_tol, _COARSE_GMRES, _NOTHING, NULL, apply_coarse_operator_PRECISION, &gmres, l->next_level ); if ( g.odd_even && l->next_level->level == 0 ) diff --git a/src/vector_generic.c b/src/vector_generic.c index 56523af..688426d 100644 --- a/src/vector_generic.c +++ b/src/vector_generic.c @@ -40,7 +40,7 @@ void vector_PRECISION_alloc( vector_PRECISION *vec, const int type, int num_vect vec->type = type; vec->num_vect = num_vect; - vec->layout = _NV_LV_SP_CL_RI; + vec->layout = _NV_LV_SV; vec->l = l; } @@ -52,7 +52,7 @@ void vector_PRECISION_free( vector_PRECISION *vec, level_struct *l, Thread *thre break; case _SCHWARZ : PUBLIC_FREE( vec->vector_buffer, complex_PRECISION, l->schwarz_vector_size*vec->num_vect ); break; - case _INNER: PUBLIC_FREE( vec->vector_buffer, complex_PRECISION, l->inner_vector_size*vec->num_vect ); + case _INNER : PUBLIC_FREE( vec->vector_buffer, 
complex_PRECISION, l->inner_vector_size*vec->num_vect ); break; } } @@ -79,6 +79,9 @@ void vector_PRECISION_define( vector_PRECISION *phi, complex_PRECISION value, in void vector_PRECISION_real_scale( vector_PRECISION *z, vector_PRECISION *x, complex_PRECISION alpha, int start, int end, level_struct *l ) { + vector_PRECISION_check_comp( z, x ); + //z->layout = x->layout; + PRECISION *r_z = (PRECISION*)z->vector_buffer, *r_x = (PRECISION*)x->vector_buffer, r_alpha = creal_PRECISION(alpha); int r_start = 2*start, r_end = 2*end; @@ -94,21 +97,31 @@ void vector_PRECISION_copy( vector_PRECISION *z, vector_PRECISION *x, int start, int end, level_struct *l ) { + + //vector_PRECISION_check_comp( z, x ); + //z->layout = x->layout; int thread = omp_get_thread_num(); if(thread == 0 && start != end) PROF_PRECISION_START( _CPY ); VECTOR_FOR( int i=start, i<end, z->vector_buffer[i] = x->vector_buffer[i], i++, l ); - + if(thread == 0 && start != end) PROF_PRECISION_STOP( _CPY, (double)(end-start)/(double)l->inner_vector_size ); } -void vector_PRECISION_check_compatibility( vector_PRECISION *vec1, vector_PRECISION *vec2) { +void vector_PRECISION_check_comp( vector_PRECISION *vec1, vector_PRECISION *vec2) { - + if(vec1->num_vect != vec2->num_vect) + error0("Error: The number of vectors has to be the same in both vectors\n"); + + if(vec1->l->level != vec2->l->level) + error0("Error: The multigrid level must be the same in both vectors\n"); + + if(vec1->type != vec2->type) + error0("Error: The type must be the same in both vectors\n"); } @@ -116,9 +129,12 @@ void vector_PRECISION_change_layout( vector_PRECISION *vec_out, vector_PRECISION *vec_in, const int layout, Thread *threading ) { if(vec_in->layout==layout) return; + + vector_PRECISION_check_comp( vec_out, vec_in ); - int n, i, s, c, lv = 0, num_s, num_c; + int n, i, sv, lv = 0, num_sv = vec_in->l->num_lattice_site_var; vector_PRECISION vec_tmp; + if( vec_in->vector_buffer == vec_out->vector_buffer ){ vector_PRECISION_init( &vec_tmp ); vector_PRECISION_alloc( &vec_tmp, vec_in->type, vec_in->num_vect, vec_in->l, no_threading ); @@ -126,14 +142,6 @@ vec_tmp = *vec_out; } - if(vec_in->l->depth == 0){ - num_s = 4; - num_c = 3; - } else { - num_s = 2; - num_c = vec_in->l->num_parent_eig_vect; - } - switch (vec_in->type){ case _ORDINARY : lv = vec_in->l->num_lattice_sites; break; case _SCHWARZ : lv = 2*vec_in->l->num_lattice_sites - vec_in->l->num_inner_lattice_sites; break; case _INNER: lv = vec_in->l->num_inner_lattice_sites; break; } switch (layout){ - case _NV_LV_SP_CL_RI : + case _NV_LV_SV : for( n=0; n<vec_in->num_vect; n++ ) for( i=0; i<lv; i++ ) - for( s=0; s<num_s; s++ ) - for( c=0; c<num_c; c++ ) - vec_tmp.vector_buffer[INDEX_NV_LV_SP_CL( n, vec_in->num_vect, i, lv, s, num_s, c, num_c )] = vec_in->vector_buffer[INDEX_LV_SP_CL_NV( n, vec_in->num_vect, i, lv, s, num_s, c, num_c )]; + for( sv=0; sv<num_sv; sv++ ) + vec_tmp.vector_buffer[INDEX_NV_LV_SV( n, vec_in->num_vect, i, lv, sv, num_sv )] = vec_in->vector_buffer[INDEX_LV_SV_NV( n, vec_in->num_vect, i, lv, sv, num_sv )]; - vec_out->layout = _NV_LV_SP_CL_RI; + vec_out->layout = _NV_LV_SV; break; - case _LV_SP_CL_RI_NV : + case _LV_SV_NV : for( i=0; i<lv; i++ ) - for( s=0; s<num_s; s++ ) - for( c=0; c<num_c; c++ ) - for( n=0; n<vec_in->num_vect; n++ ) - vec_tmp.vector_buffer[INDEX_LV_SP_CL_NV( n, vec_in->num_vect, i, lv, s, num_s, c, num_c )] = vec_in->vector_buffer[INDEX_NV_LV_SP_CL( n, vec_in->num_vect, i, lv, s, num_s, c, num_c )]; + for( sv=0; sv<num_sv; sv++ ) + for( n=0; n<vec_in->num_vect; n++ ) + vec_tmp.vector_buffer[INDEX_LV_SV_NV( n, vec_in->num_vect, i, lv, sv, num_sv )] = vec_in->vector_buffer[INDEX_NV_LV_SV( n,
vec_in->num_vect, i, lv, sv, num_sv )]; - vec_out->layout = _LV_SP_CL_RI_NV; + vec_out->layout = _LV_SV_NV; break; } if( vec_in->vector_buffer == vec_out->vector_buffer ){ - vector_PRECISION_copy( vec_out, &vec_tmp, 0, lv*num_s*num_c*vec_out->num_vect, vec_out->l ); + vector_PRECISION_copy( vec_out, &vec_tmp, 0, lv*num_sv*vec_out->num_vect, vec_out->l ); vector_PRECISION_free( &vec_tmp, vec_in->l, no_threading ); } @@ -189,8 +195,8 @@ void vector_PRECISION_test_routine( level_struct *l, struct Thread *threading ) vector_PRECISION_define_random( &vp[0], 0, 4*l->vector_size, l ); vector_PRECISION_copy( &vp[1], &vp[0], 0, 4*l->vector_size, l ); - vector_PRECISION_change_layout( &vp[1], &vp[1], _LV_SP_CL_RI_NV, no_threading ); - vector_PRECISION_change_layout( &vp[1], &vp[1], _NV_LV_SP_CL_RI, no_threading ); + vector_PRECISION_change_layout( &vp[1], &vp[1], _LV_SV_NV, no_threading ); + vector_PRECISION_change_layout( &vp[1], &vp[1], _NV_LV_SV, no_threading ); vector_PRECISION_minus( &vp[2], &vp[1], &vp[0], 0, 4*l->vector_size, l ); diff = global_norm_PRECISION( &vp[2], 0, 4*l->vector_size, l, no_threading )/ global_norm_PRECISION( &vp[0], 0, 4*l->vector_size, l, no_threading ); diff --git a/src/vector_generic.h b/src/vector_generic.h index d2aeb5e..1ca720e 100644 --- a/src/vector_generic.h +++ b/src/vector_generic.h @@ -31,7 +31,7 @@ void vector_PRECISION_real_scale( vector_PRECISION *z, vector_PRECISION *x, complex_PRECISION alpha, int start, int end, level_struct *l ); void vector_PRECISION_copy( vector_PRECISION *z, vector_PRECISION *x, int start, int end, level_struct *l ); // z := x - void vector_PRECISION_check_compatibility( vector_PRECISION *vec1, vector_PRECISION *vec2); + void vector_PRECISION_check_comp( vector_PRECISION *vec1, vector_PRECISION *vec2 ); void vector_PRECISION_change_layout( vector_PRECISION *vec_out, vector_PRECISION *vec_in, const int layout, Thread *threading ); void vector_PRECISION_test_routine( level_struct *l, struct Thread *threading ); From b1aa277ba6081326cae54e1025b5f7b279488c91 Mon Sep 17 00:00:00 2001 From: Marc Illa Date: Mon, 6 Aug 2018 14:48:23 +0300 Subject: [PATCH 16/31] Added check compatibility function (does not work with TM1p1) 2 --- src/init.c | 4 ++-- src/linsolve.c | 33 ++++++++++++++++++++++++--------- src/linsolve.h | 2 +- 3 files changed, 27 insertions(+), 12 deletions(-) diff --git a/src/init.c b/src/init.c index bb36883..0c22506 100644 --- a/src/init.c +++ b/src/init.c @@ -152,7 +152,7 @@ void method_setup( vector_double *V, level_struct *l, struct Thread *threading ) #ifdef INIT_ONE_PREC if ( g.mixed_precision == 2 ) { #endif - fgmres_MP_struct_alloc( g.restart, g.max_restart, l->inner_vector_size, + fgmres_MP_struct_alloc( g.restart, g.max_restart, l->inner_vector_size, _INNER, g.tol, _RIGHT, vcycle_float, &(g.p_MP), l ); g.p.op = &(g.op_double); #if defined(INIT_ONE_PREC) && (defined (DEBUG) || defined (TEST_VECTOR_ANALYSIS)) @@ -178,7 +178,7 @@ void method_setup( vector_double *V, level_struct *l, struct Thread *threading ) #ifdef INIT_ONE_PREC if ( g.mixed_precision == 2 ) { #endif - fgmres_MP_struct_alloc( g.restart, g.max_restart, l->inner_vector_size, + fgmres_MP_struct_alloc( g.restart, g.max_restart, l->inner_vector_size, _INNER, g.tol, _NOTHING, NULL, &(g.p_MP), l ); g.p.op = &(g.op_double); #if defined(INIT_ONE_PREC) && (defined (DEBUG) || defined (TEST_VECTOR_ANALYSIS)) diff --git a/src/linsolve.c b/src/linsolve.c index b6ca32f..3d7c0c7 100644 --- a/src/linsolve.c +++ b/src/linsolve.c @@ -28,10 +28,10 @@ void 
fgmres_MP_struct_init( gmres_MP_struct *p ) { } -void fgmres_MP_struct_alloc( int m, int n, long int vl, double tol, const int prec_kind, +void fgmres_MP_struct_alloc( int m, int n, long int vl, const int vl_type, double tol, const int prec_kind, void (*precond)(), gmres_MP_struct *p, level_struct *l ) { long int total=0; - int i, k=0; + int i, k=0, n_vl=1; p->dp.restart_length = m; p->sp.restart_length = m; p->dp.num_restart = n; p->sp.num_restart = n; @@ -61,6 +61,7 @@ #ifdef HAVE_TM1p1 vl*=2; + n_vl=2; #endif // double precision part @@ -90,11 +91,17 @@ void fgmres_MP_struct_alloc( int m, int n, long int vl, double tol, const int pr // s p->dp.s = p->dp.H[0] + total; total += m+1; // x - p->dp.x.vector_buffer = p->dp.H[0] + total; total += vl; + vector_double_alloc( &(p->dp.x), vl_type, n_vl, l, no_threading ); + total += vl; + //p->dp.x.vector_buffer = p->dp.H[0] + total; total += vl; // r - p->dp.r.vector_buffer = p->dp.H[0] + total; total += vl; + vector_double_alloc( &(p->dp.r), vl_type, n_vl, l, no_threading ); + total += vl; + //p->dp.r.vector_buffer = p->dp.H[0] + total; total += vl; // b - p->dp.b.vector_buffer = p->dp.H[0] + total; total += vl; + vector_double_alloc( &(p->dp.b), vl_type, n_vl, l, no_threading ); + total += vl; + //p->dp.b.vector_buffer = p->dp.H[0] + total; total += vl; ASSERT( p->dp.total_storage == total ); @@ -117,20 +124,28 @@ void fgmres_MP_struct_alloc( int m, int n, long int vl, double tol, const int pr // precomputed storage amount vector_float_init(&(p->sp.w)); - MALLOC( p->sp.w.vector_buffer, complex_float, total ); + //MALLOC( p->sp.w.vector_buffer, complex_float, total ); // reserve storage total = 0; // w - p->sp.w.vector_buffer = p->sp.w.vector_buffer + total; total += vl; + vector_float_alloc( &(p->sp.w), vl_type, n_vl, l, no_threading ); + total += vl; + //p->sp.w.vector_buffer = p->sp.w.vector_buffer + total; total += vl; // V for ( i=0; i<m+1; i++ ) { - p->sp.V[i].vector_buffer = p->sp.w.vector_buffer + total; total += vl; + vector_float_init(&(p->sp.V[i])); + vector_float_alloc( &(p->sp.V[i]), vl_type, n_vl, l, no_threading ); + total += vl; + //p->sp.V[i].vector_buffer = p->sp.w.vector_buffer + total; total += vl; } // Z if ( precond != NULL ) { for ( i=0; i<k; i++ ) { - p->sp.Z[i].vector_buffer = p->sp.w.vector_buffer + total; total += vl; + vector_float_init(&(p->sp.Z[i])); + vector_float_alloc( &(p->sp.Z[i]), vl_type, n_vl, l, no_threading ); + total += vl; + //p->sp.Z[i].vector_buffer = p->sp.w.vector_buffer + total; total += vl; } } diff --git a/src/linsolve.h index 38426d1..f7368d5 100644 --- a/src/linsolve.h +++ b/src/linsolve.h @@ -36,7 +36,7 @@ gmres_float_struct *p, level_struct *l, struct Thread *threading ); void fgmres_MP_struct_init( gmres_MP_struct *p ); - void fgmres_MP_struct_alloc( int m, int n, long int vl, double tol, const int prec_kind, + void fgmres_MP_struct_alloc( int m, int n, long int vl, const int vl_type, double tol, const int prec_kind, void (*precond)(), gmres_MP_struct *p, level_struct* l ); void fgmres_MP_struct_free( gmres_MP_struct *p ); From 204845b6c1fb327f945bfd4b334b55e9f5a7381a Mon Sep 17 00:00:00 2001 From: Marc Illa Date: Mon, 6 Aug 2018 15:20:56 +0300 Subject: [PATCH 17/31] Fixed fgmres(PRECISION and MG) alloc and free --- src/init.c | 12 ++++++------ src/init_generic.c | 4 ++-- src/linsolve.c | 32 ++++++-------------------- src/linsolve.h | 4 ++-- src/linsolve_generic.c | 25 ++++++------------------- src/linsolve_generic.h | 2 +-
src/schwarz_generic.c | 6 +++--- src/setup_generic.c | 2 +- 8 files changed, 27 insertions(+), 60 deletions(-) diff --git a/src/init.c b/src/init.c index 0c22506..ad29bbc 100644 --- a/src/init.c +++ b/src/init.c @@ -152,7 +152,7 @@ void method_setup( vector_double *V, level_struct *l, struct Thread *threading ) #ifdef INIT_ONE_PREC if ( g.mixed_precision == 2 ) { #endif - fgmres_MP_struct_alloc( g.restart, g.max_restart, l->inner_vector_size, _INNER, + fgmres_MP_struct_alloc( g.restart, g.max_restart, _INNER, g.tol, _RIGHT, vcycle_float, &(g.p_MP), l ); g.p.op = &(g.op_double); #if defined(INIT_ONE_PREC) && (defined (DEBUG) || defined (TEST_VECTOR_ANALYSIS)) @@ -167,7 +167,7 @@ void method_setup( vector_double *V, level_struct *l, struct Thread *threading ) #ifdef INIT_ONE_PREC } else { #endif - fgmres_double_struct_alloc( g.restart, g.max_restart, l->inner_vector_size, _INNER, g.tol, + fgmres_double_struct_alloc( g.restart, g.max_restart, _INNER, g.tol, _GLOBAL_FGMRES, _RIGHT, preconditioner, g.method==6?g5D_plus_clover_double:d_plus_clover_double, &(g.p), l ); } @@ -178,7 +178,7 @@ void method_setup( vector_double *V, level_struct *l, struct Thread *threading ) #ifdef INIT_ONE_PREC if ( g.mixed_precision == 2 ) { #endif - fgmres_MP_struct_alloc( g.restart, g.max_restart, l->inner_vector_size, _INNER, + fgmres_MP_struct_alloc( g.restart, g.max_restart, _INNER, g.tol, _NOTHING, NULL, &(g.p_MP), l ); g.p.op = &(g.op_double); #if defined(INIT_ONE_PREC) && (defined (DEBUG) || defined (TEST_VECTOR_ANALYSIS)) @@ -193,14 +193,14 @@ void method_setup( vector_double *V, level_struct *l, struct Thread *threading ) #ifdef INIT_ONE_PREC } else { #endif - fgmres_double_struct_alloc( g.restart, g.max_restart, l->inner_vector_size, _INNER, g.tol, + fgmres_double_struct_alloc( g.restart, g.max_restart, _INNER, g.tol, _GLOBAL_FGMRES, _NOTHING, NULL, d_plus_clover_double, &(g.p), l ); #ifdef INIT_ONE_PREC } #endif } else if ( g.method == -1 ) { - fgmres_double_struct_alloc( 4, g.restart*g.max_restart, l->inner_vector_size, _INNER, g.tol, + fgmres_double_struct_alloc( 4, g.restart*g.max_restart, _INNER, g.tol, _GLOBAL_FGMRES, _NOTHING, NULL, d_plus_clover_double, &(g.p), l ); fine_level_double_alloc( l ); } @@ -361,7 +361,7 @@ void method_free( level_struct *l ) { #ifdef INIT_ONE_PREC if ( g.mixed_precision == 2 && g.method >= 0 ) { #endif - fgmres_MP_struct_free( &(g.p_MP) ); + fgmres_MP_struct_free( &(g.p_MP), l ); #if defined (INIT_ONE_PREC) && (defined (DEBUG) || defined (TEST_VECTOR_ANALYSIS)) #ifdef HAVE_TM1p1 FREE( g.p.b, complex_double, 2*l->inner_vector_size ); diff --git a/src/init_generic.c b/src/init_generic.c index 170f558..0fb7b21 100644 --- a/src/init_generic.c +++ b/src/init_generic.c @@ -135,14 +135,14 @@ void next_level_PRECISION_setup( level_struct *l ) { coarsening_index_table_PRECISION_define( &(l->is_PRECISION), &(l->s_PRECISION), l ); if ( l->level == 1 && !l->next_level->idle ) { - fgmres_PRECISION_struct_alloc( g.coarse_iter, g.coarse_restart, l->next_level->vector_size, _ORDINARY, g.coarse_tol, + fgmres_PRECISION_struct_alloc( g.coarse_iter, g.coarse_restart, _ORDINARY, g.coarse_tol, _COARSE_GMRES, _NOTHING, NULL, g.method==6?(g.odd_even?g5D_coarse_apply_schur_complement_PRECISION:g5D_apply_coarse_operator_PRECISION) :(g.odd_even?coarse_apply_schur_complement_PRECISION:apply_coarse_operator_PRECISION), &(l->next_level->p_PRECISION), l->next_level ); } else { if ( g.kcycle ) { - fgmres_PRECISION_struct_alloc( g.kcycle_restart, g.kcycle_max_restart, l->next_level->vector_size, 
_ORDINARY, g.kcycle_tol, + fgmres_PRECISION_struct_alloc( g.kcycle_restart, g.kcycle_max_restart, _ORDINARY, g.kcycle_tol, _K_CYCLE, _RIGHT, vcycle_PRECISION, g.method==6?g5D_apply_coarse_operator_PRECISION:apply_coarse_operator_PRECISION, &(l->next_level->p_PRECISION), l->next_level ); diff --git a/src/linsolve.c b/src/linsolve.c index 3d7c0c7..4cc1209 100644 --- a/src/linsolve.c +++ b/src/linsolve.c @@ -28,7 +28,7 @@ void fgmres_MP_struct_init( gmres_MP_struct *p ) { } -void fgmres_MP_struct_alloc( int m, int n, long int vl, const int vl_type, double tol, const int prec_kind, +void fgmres_MP_struct_alloc( int m, int n, const int vl_type, double tol, const int prec_kind, void (*precond)(), gmres_MP_struct *p, level_struct *l ) { long int total=0; int i, k=0, n_vl=1; @@ -60,7 +60,6 @@ void fgmres_MP_struct_alloc( int m, int n, long int vl, const int vl_type, doubl } #ifdef HAVE_TM1p1 - vl*=2; n_vl=2; #endif @@ -69,7 +68,6 @@ void fgmres_MP_struct_alloc( int m, int n, long int vl, const int vl_type, doubl total += (m+1)*m; // Hessenberg matrix MALLOC( p->dp.H, complex_double*, m ); total += 4*(m+1); // y, gamma, c, s - total += 3*vl; // x, r, b p->dp.total_storage = total; // precomputed storage amount @@ -92,30 +90,21 @@ void fgmres_MP_struct_alloc( int m, int n, long int vl, const int vl_type, doubl p->dp.s = p->dp.H[0] + total; total += m+1; // x vector_double_alloc( &(p->dp.x), vl_type, n_vl, l, no_threading ); - total += vl; - //p->dp.x.vector_buffer = p->dp.H[0] + total; total += vl; // r vector_double_alloc( &(p->dp.r), vl_type, n_vl, l, no_threading ); - total += vl; - //p->dp.r.vector_buffer = p->dp.H[0] + total; total += vl; // b vector_double_alloc( &(p->dp.b), vl_type, n_vl, l, no_threading ); - total += vl; - //p->dp.b.vector_buffer = p->dp.H[0] + total; total += vl; ASSERT( p->dp.total_storage == total ); // single precision part total = 0; - total += (2+m)*vl; // w, V MALLOC( p->sp.V, vector_float, m+1 ); if ( precond != NULL ) { if ( prec_kind == _RIGHT ) { - total += (m+1)*vl; // Z k = m+1; } else { - total += vl; k = 1; } MALLOC( p->sp.Z, vector_float, k ); @@ -123,29 +112,18 @@ void fgmres_MP_struct_alloc( int m, int n, long int vl, const int vl_type, doubl p->sp.total_storage = total; // precomputed storage amount - vector_float_init(&(p->sp.w)); - //MALLOC( p->sp.w.vector_buffer, complex_float, total ); - // reserve storage total = 0; // w vector_float_alloc( &(p->sp.w), vl_type, n_vl, l, no_threading ); - total += vl; - //p->sp.w.vector_buffer = p->sp.w.vector_buffer + total; total += vl; // V for ( i=0; isp.V[i])); vector_float_alloc( &(p->sp.V[i]), vl_type, n_vl, l, no_threading ); - total += vl; - //p->sp.V[i].vector_buffer = p->sp.w.vector_buffer + total; total += vl; } // Z if ( precond != NULL ) { for ( i=0; isp.Z[i])); vector_float_alloc( &(p->sp.Z[i]), vl_type, n_vl, l, no_threading ); - total += vl; - //p->sp.Z[i].vector_buffer = p->sp.w.vector_buffer + total; total += vl; } } @@ -153,10 +131,10 @@ void fgmres_MP_struct_alloc( int m, int n, long int vl, const int vl_type, doubl } -void fgmres_MP_struct_free( gmres_MP_struct *p ) { +void fgmres_MP_struct_free( gmres_MP_struct *p, level_struct *l ) { // single precision - FREE( p->sp.w.vector_buffer, complex_float, p->sp.total_storage ); + vector_float_free( &(p->sp.w), l, no_threading ); FREE( p->sp.V, vector_float, p->sp.restart_length+1 ); if ( p->sp.Z != NULL ) FREE( p->sp.Z, vector_float, p->sp.kind==_RIGHT?p->sp.restart_length+1:1 ); @@ -164,7 +142,9 @@ void fgmres_MP_struct_free( gmres_MP_struct *p ) { // 
diff --git a/src/linsolve.h index f7368d5..3c82b0f 100644 --- a/src/linsolve.h +++ b/src/linsolve.h @@ -36,9 +36,9 @@ gmres_float_struct *p, level_struct *l, struct Thread *threading ); void fgmres_MP_struct_init( gmres_MP_struct *p ); - void fgmres_MP_struct_alloc( int m, int n, long int vl, const int vl_type, double tol, const int prec_kind, + void fgmres_MP_struct_alloc( int m, int n, const int vl_type, double tol, const int prec_kind, void (*precond)(), gmres_MP_struct *p, level_struct* l ); - void fgmres_MP_struct_free( gmres_MP_struct *p ); + void fgmres_MP_struct_free( gmres_MP_struct *p, level_struct *l ); int fgmres_MP( gmres_MP_struct *p, level_struct *l, struct Thread *threading ); diff --git a/src/linsolve_generic.c index 45eb1e1..39fa64c 100644 --- a/src/linsolve_generic.c +++ b/src/linsolve_generic.c @@ -44,7 +44,7 @@ void fgmres_PRECISION_struct_init( gmres_PRECISION_struct *p ) { } -void fgmres_PRECISION_struct_alloc( int m, int n, long int vl, const int vl_type, PRECISION tol, const int type, const int prec_kind, +void fgmres_PRECISION_struct_alloc( int m, int n, const int vl_type, PRECISION tol, const int type, const int prec_kind, void (*precond)(), void (*eval_op)(), gmres_PRECISION_struct *p, level_struct *l ) { /********************************************************************************* @@ -72,7 +72,6 @@ void fgmres_PRECISION_struct_alloc( int m, int n, long int vl, const int vl_type p->kind = prec_kind; #ifdef HAVE_TM1p1 - vl*=2; n_vl=2; #endif @@ -80,22 +79,18 @@ void fgmres_PRECISION_struct_alloc( int m, int n, long int vl, const int vl_type total += (m+1)*m; // Hessenberg matrix MALLOC( p->H, complex_PRECISION*, m ); - total += (5+m)*vl; // x, r, b, w, V MALLOC( p->V, vector_PRECISION, m+1 ); if ( precond != NULL ) { if ( prec_kind == _RIGHT ) { - total += (m+1)*vl; // Z k = m+1; } else { - total += vl; k = 1; } MALLOC( p->Z, vector_PRECISION, k ); } else { #if defined(SINGLE_ALLREDUCE_ARNOLDI) && defined(PIPELINED_ARNOLDI) if ( l->level == 0 && l->depth > 0 ) { - total += (m+2)*vl; k = m+2; MALLOC( p->Z, vector_PRECISION, k ); } @@ -128,35 +123,23 @@ void fgmres_PRECISION_struct_alloc( int m, int n, long int vl, const int vl_type p->s = p->H[0] + total; total += m+1; // w vector_PRECISION_alloc( &(p->w), vl_type, n_vl, l, no_threading ); - total += vl; - //p->w.vector_buffer = p->H[0] + total; total += vl; // V for ( i=0; i<m+1; i++ ) { - vector_PRECISION_init(&(p->V[i])); vector_PRECISION_alloc( &(p->V[i]), vl_type, n_vl, l, no_threading ); - total += vl; - //p->V[i].vector_buffer = p->H[0] + total; total += vl; } // Z for ( i=0; i<k; i++ ) { - vector_PRECISION_init(&(p->Z[i])); vector_PRECISION_alloc( &(p->Z[i]), vl_type, n_vl, l, no_threading ); - total += vl; - //p->Z[i].vector_buffer = p->H[0] + total; total += vl; } // x vector_PRECISION_alloc( &(p->x), vl_type, n_vl, l, no_threading ); - total += vl; - //p->x.vector_buffer = p->H[0] + total; total += vl; // r vector_PRECISION_alloc( &(p->r), vl_type, n_vl, l, no_threading ); - total += vl; - //p->r.vector_buffer = p->H[0] + total; total += vl; // b vector_PRECISION_alloc( &(p->b), vl_type, n_vl, l, no_threading ); - total += vl; - //p->b.vector_buffer = p->H[0] + total; total += vl; ASSERT( p->total_storage == total ); } @@ -221,7 +204,11 @@ void 
fgmres_PRECISION_struct_free( gmres_PRECISION_struct *p, level_struct *l ) FREE( p->H[0], complex_PRECISION, p->total_storage ); FREE( p->H, complex_PRECISION*, p->restart_length ); FREE( p->V, vector_PRECISION, p->restart_length+1 ); - + vector_PRECISION_free( &(p->w), l, no_threading ); + vector_PRECISION_free( &(p->x), l, no_threading ); + vector_PRECISION_free( &(p->r), l, no_threading ); + vector_PRECISION_free( &(p->b), l, no_threading ); + if ( p->Z != NULL ) FREE( p->Z, vector_PRECISION, k ); } diff --git a/src/linsolve_generic.h b/src/linsolve_generic.h index 0000d7f..7bc8ae3 100644 --- a/src/linsolve_generic.h +++ b/src/linsolve_generic.h @@ -25,7 +25,7 @@ struct Thread; void fgmres_PRECISION_struct_init( gmres_PRECISION_struct *p ); - void fgmres_PRECISION_struct_alloc( int m, int n, long int vl, const int vl_type, PRECISION tol, const int type, const int prec_kind, + void fgmres_PRECISION_struct_alloc( int m, int n, const int vl_type, PRECISION tol, const int type, const int prec_kind, void (*precond)(), void (*eval_op)(), gmres_PRECISION_struct *p, level_struct *l ); void fgmres_PRECISION_struct_free( gmres_PRECISION_struct *p, level_struct *l ); diff --git a/src/schwarz_generic.c b/src/schwarz_generic.c index fbf65cd..1a86313 100644 --- a/src/schwarz_generic.c +++ b/src/schwarz_generic.c @@ -74,19 +74,19 @@ void schwarz_PRECISION_alloc( schwarz_PRECISION_struct *s, level_struct *l ) { int i, j, n, mu, nu, *bl = l->block_lattice; if ( g.method == 4 ) { - fgmres_PRECISION_struct_alloc( l->block_iter, 1, (l->depth==0)?l->inner_vector_size:l->vector_size, (l->depth==0)?_INNER:_ORDINARY, + fgmres_PRECISION_struct_alloc( l->block_iter, 1, (l->depth==0)?_INNER:_ORDINARY, EPS_PRECISION, _COARSE_GMRES, _NOTHING, NULL, (l->depth==0)?(g.odd_even?apply_schur_complement_PRECISION:d_plus_clover_PRECISION): (g.odd_even?coarse_apply_schur_complement_PRECISION:apply_coarse_operator_PRECISION), &(l->sp_PRECISION), l ); } else if ( g.method == 5 ) { - fgmres_PRECISION_struct_alloc( 5, 1, (l->depth==0)?l->inner_vector_size:l->vector_size, (l->depth==0)?_INNER:_ORDINARY, + fgmres_PRECISION_struct_alloc( 5, 1, (l->depth==0)?_INNER:_ORDINARY, EPS_PRECISION, _COARSE_GMRES, _NOTHING, NULL, (l->depth==0)?(g.odd_even?apply_schur_complement_PRECISION:d_plus_clover_PRECISION): (g.odd_even?coarse_apply_schur_complement_PRECISION:apply_coarse_operator_PRECISION), &(l->sp_PRECISION), l ); } else if ( g.method == 6 ) { - fgmres_PRECISION_struct_alloc( l->block_iter, 1, (l->depth==0)?l->inner_vector_size:l->vector_size, (l->depth==0)?_INNER:_ORDINARY, + fgmres_PRECISION_struct_alloc( l->block_iter, 1, (l->depth==0)?_INNER:_ORDINARY, EPS_PRECISION, _COARSE_GMRES, _NOTHING, NULL, (l->depth==0)?(g.odd_even?g5D_apply_schur_complement_PRECISION:g5D_plus_clover_PRECISION): (g.odd_even?g5D_coarse_apply_schur_complement_PRECISION:g5D_apply_coarse_operator_PRECISION), diff --git a/src/setup_generic.c b/src/setup_generic.c index 4fe4d70..5570a60 100644 --- a/src/setup_generic.c +++ b/src/setup_generic.c @@ -345,7 +345,7 @@ void inv_iter_2lvl_extension_setup_PRECISION( int setup_iter, level_struct *l, s vector_PRECISION_init( &buf1 ); vector_PRECISION_alloc( &buf1, _ORDINARY, 1, l, no_threading ); fgmres_PRECISION_struct_init( &gmres ); - fgmres_PRECISION_struct_alloc( g.coarse_iter, g.coarse_restart, l->next_level->vector_size, _ORDINARY, g.coarse_tol, + fgmres_PRECISION_struct_alloc( g.coarse_iter, g.coarse_restart, _ORDINARY, g.coarse_tol, _COARSE_GMRES, _NOTHING, NULL, apply_coarse_operator_PRECISION, &gmres, 
l->next_level ); if ( g.odd_even && l->next_level->level == 0 ) From fc85d494a52b468bb5b9ea9cedd928a8b63cacbc Mon Sep 17 00:00:00 2001 From: Marc Illa Date: Tue, 7 Aug 2018 16:23:48 +0300 Subject: [PATCH 18/31] Added AVX/AVX2/AVX512/OPTIMIZE flags --- src/init_generic.c | 1 + src/linalg_generic.c | 8 ++-- src/main.h | 9 ++++- src/main_pre_def_generic.h | 14 ++++++- src/vector_generic.c | 78 ++++++++++++++++++++++++++++++-------- 5 files changed, 87 insertions(+), 23 deletions(-) diff --git a/src/init_generic.c index 0fb7b21..1c64c5e 100644 --- a/src/init_generic.c +++ b/src/init_generic.c @@ -52,6 +52,7 @@ void prof_PRECISION_init( level_struct *l ) { sprintf( l->prof_PRECISION.name[_GRAM_SCHMIDT], "Gram-Schmidt, PRECISION" ); sprintf( l->prof_PRECISION.name[_GRAM_SCHMIDT_ON_AGGREGATES], "Gram-Schmidt on aggregates, PRECISION" ); sprintf( l->prof_PRECISION.name[_CPY], "copy operations, PRECISION" ); + sprintf( l->prof_PRECISION.name[_RS], "real scale operations, PRECISION" ); sprintf( l->prof_PRECISION.name[_SET], "set value operations, PRECISION" ); sprintf( l->prof_PRECISION.name[_PR], "interpolation and restriction, PRECISION" ); l->prof_PRECISION.flop[_PR] = level_ratio*l->num_lattice_site_var*8.0*(l->num_lattice_site_var/2); diff --git a/src/linalg_generic.c index 1076820..94be137 100644 --- a/src/linalg_generic.c +++ b/src/linalg_generic.c @@ -320,13 +320,13 @@ void buffer_PRECISION_real_scale( complex_PRECISION *z, complex_PRECISION *x, co void buffer_PRECISION_copy( complex_PRECISION *z, complex_PRECISION *x, int start, int end, level_struct *l ) { int thread = omp_get_thread_num(); - if(thread == 0 && start != end) - PROF_PRECISION_START( _CPY ); + //if(thread == 0 && start != end) + //PROF_PRECISION_START( _CPY ); VECTOR_FOR( int i=start, i<end, z[i] = x[i], i++, l ); - if(thread == 0 && start != end) - PROF_PRECISION_STOP( _CPY, (double)(end-start)/(double)l->inner_vector_size ); + //if(thread == 0 && start != end) + //PROF_PRECISION_STOP( _CPY, (double)(end-start)/(double)l->inner_vector_size ); } #ifndef OPTIMIZED_LINALG_PRECISION diff --git a/src/main.h index 689a7f3..0187330 100644 --- a/src/main.h +++ b/src/main.h @@ -32,6 +32,11 @@ #ifndef MAIN_HEADER #define MAIN_HEADER + #define double_SIZE 64 + #define float_SIZE 32 + #define double_LENGTH SIMD_LENGTH/double_SIZE + #define float_LENGTH SIMD_LENGTH/float_SIZE + #define STRINGLENGTH 500 #define _FILE_OFFSET_BITS 64 @@ -87,7 +92,7 @@ #ifdef SSE #define MALLOC( variable, kind, length ) do{ if ( variable != NULL ) { \ printf0("malloc of \"%s\" failed: pointer is not NULL (%s:%d).\n", #variable, __FILE__, __LINE__ ); } \ - if ( (length) > 0 ) { variable = (kind*) memalign( 64, sizeof(kind) * (length) ); } \ + if ( (length) > 0 ) { variable = (kind*) memalign( SIMD_LENGTH, sizeof(kind) * (length) ); } \ if ( variable == NULL && (length) > 0 ) { \ error0("malloc of \"%s\" failed: no memory allocated (%s:%d), current memory used: %lf GB.\n", \ #variable, __FILE__, __LINE__, g.cur_storage/1024.0 ); } \ @@ -204,7 +209,7 @@ enum { _LEFT, _RIGHT, _NOTHING }; enum { _PERIODIC, _ANTIPERIODIC, _TWISTED, _DIRICHLET }; enum { _GIP, _PIP, _LA2, _LA6, _LA8, _LA, _CPY, _SET, _PR, _SC, _NC, _SM, _OP_COMM, _OP_IDLE, _ALLR, _GD_COMM, _GD_IDLE, _GRAM_SCHMIDT, _GRAM_SCHMIDT_ON_AGGREGATES, - _SM1, _SM2, _SM3, _SM4, _SMALL1, _SMALL2, _NUM_PROF }; // _NUM_PROF has always to be the last constant! + _SM1, _SM2, _SM3, _SM4, _SMALL1, _SMALL2, _RS, _NUM_PROF }; // _NUM_PROF has always to be the last constant! enum { _VTS = 20 }; enum { _TRCKD_VAL, _STP_TIME, _SLV_ITER, _SLV_TIME, _CRS_ITER, _CRS_TIME, _SLV_ERR, _CGNR_ERR, _NUM_OPTB }; enum { _NV_LV_SV, _LV_SV_NV }; //vector layout
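Note: SIMD_LENGTH is expressed in bits, so the new width macros give the number of packed elements per register, e.g. an AVX512 build yields 512/64 = 8 doubles and 512/32 = 16 floats. A minimal sketch of how the macros compose, assuming -DAVX512 on the compile line (parentheses are added here for safe expansion; the defines above omit them):

  #define SIMD_LENGTH 512                            /* selected by -DAVX512      */
  #define double_SIZE 64                             /* bits per double           */
  #define float_SIZE  32                             /* bits per float            */
  #define double_LENGTH (SIMD_LENGTH/double_SIZE)    /* 8 doubles per register    */
  #define float_LENGTH  (SIMD_LENGTH/float_SIZE)     /* 16 floats per register    */

Two caveats worth recording: "#elif AVX2" only works because -DAVX2 defines the macro to 1, so "#elif defined(AVX2)" would be the more robust spelling; and memalign( SIMD_LENGTH, ... ) receives this bit count as a byte alignment, i.e. a 512-bit build aligns to 512 bytes, which is valid but stricter than the 64 bytes an aligned AVX512 load actually requires.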
diff --git a/src/main_pre_def_generic.h index ce76aea..06ca9e4 100644 --- a/src/main_pre_def_generic.h +++ b/src/main_pre_def_generic.h @@ -21,10 +21,20 @@ #ifndef MAIN_PRE_DEF_PRECISION_HEADER #define MAIN_PRE_DEF_PRECISION_HEADER - + + #ifdef AVX + #define SIMD_LENGTH 128 + #elif AVX2 + #define SIMD_LENGTH 256 + #elif AVX512 + #define SIMD_LENGTH 512 + #else + #define SIMD_LENGTH 128 + #endif + typedef PRECISION complex complex_PRECISION; typedef PRECISION complex *config_PRECISION; - typedef PRECISION complex *buffer_PRECISION; + typedef PRECISION complex *buffer_PRECISION __attribute__ ((aligned (SIMD_LENGTH))); typedef struct { buffer_PRECISION vector_buffer; diff --git a/src/vector_generic.c index 688426d..9f40f44 100644 --- a/src/vector_generic.c +++ b/src/vector_generic.c @@ -65,9 +65,16 @@ void vector_PRECISION_define( vector_PRECISION *phi, complex_PRECISION value, in if(thread == 0 && start != end) PROF_PRECISION_START( _SET ); if ( phi->vector_buffer != NULL ) { - int i; - for ( i=start; i<end; i++ ) - phi->vector_buffer[i] = value; + //int i; + //for ( i=start; i<end; i++ ) + // phi->vector_buffer[i] = value; + for(int i=start; i<end; i+=PRECISION_LENGTH) + for(int j=0; j<PRECISION_LENGTH; j++) + phi->vector_buffer[i+j] = value; } else { error0("Error in \"vector_PRECISION_define\": pointer is null\n"); } @@ -82,31 +89,72 @@ void vector_PRECISION_real_scale( vector_PRECISION *z, vector_PRECISION *x, comp vector_PRECISION_check_comp( z, x ); //z->layout = x->layout; - PRECISION *r_z = (PRECISION*)z->vector_buffer, *r_x = (PRECISION*)x->vector_buffer, r_alpha = creal_PRECISION(alpha); - int r_start = 2*start, r_end = 2*end; - int thread = omp_get_thread_num(); if(thread == 0 && start != end) - PROF_PRECISION_START( _LA2 ); - - REAL_VECTOR_FOR( int i=r_start, i<r_end, r_z[i] = r_alpha*r_x[i], i++, l ); + PROF_PRECISION_START( _RS ); + + if( z == x ) { +#ifdef OPTIMIZE + PRECISION * restrict r_x = (PRECISION*)x->vector_buffer, r_alpha = creal_PRECISION(alpha); +#else + PRECISION *r_x = (PRECISION*)x->vector_buffer, r_alpha = creal_PRECISION(alpha); +#endif + int r_start = 2*start, r_end = 2*end; + + //REAL_VECTOR_FOR( int i=r_start, i<r_end, r_x[i] = r_alpha*r_x[i], i++, l ); + for( int i=r_start; i<r_end; i++ ) + r_x[i] *= r_alpha; + } else { +#ifdef OPTIMIZE + PRECISION * restrict r_z = (PRECISION*)z->vector_buffer, * restrict r_x = (PRECISION*)x->vector_buffer, r_alpha = creal_PRECISION(alpha); +#else + PRECISION *r_z = (PRECISION*)z->vector_buffer, *r_x = (PRECISION*)x->vector_buffer, r_alpha = creal_PRECISION(alpha); +#endif + int r_start = 2*start, r_end = 2*end; + + //REAL_VECTOR_FOR( int i=r_start, i<r_end, r_z[i] = r_alpha*r_x[i], i++, l ); + for( int i=r_start; i<r_end; i++ ) + r_z[i] = r_alpha*r_x[i]; + } + if(thread == 0 && start != end) - PROF_PRECISION_STOP( _LA2, (double)(end-start)/(double)l->inner_vector_size ); + PROF_PRECISION_STOP( _RS, (double)(end-start)/(double)l->inner_vector_size ); } void vector_PRECISION_copy( vector_PRECISION *z, vector_PRECISION *x, int start, int end, level_struct *l ) { + if(z == x) return; + //vector_PRECISION_check_comp( z, x ); //z->layout = x->layout; - +#ifdef OPTIMIZE + buffer_PRECISION restrict z_pt=z->vector_buffer, restrict x_pt=x->vector_buffer; +#else + buffer_PRECISION z_pt=z->vector_buffer, x_pt=x->vector_buffer; +#endif int thread = omp_get_thread_num(); if(thread == 0 && start != end) - PROF_PRECISION_START( _CPY ); + PROF_PRECISION_START( _CPY ); + //VECTOR_FOR( int i=start, i<end, z->vector_buffer[i] = x->vector_buffer[i], i++, l ); + for(int i=start; i<end; i+=PRECISION_LENGTH) + for(int j=0; j<PRECISION_LENGTH; j++) + z_pt[i+j] = x_pt[i+j]; - if(thread == 0 && start != end) PROF_PRECISION_STOP( _CPY, (double)(end-start)/(double)l->inner_vector_size ); }
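Patch 19 below makes explicit a consequence of the struct refactoring: the MALLOC macro in main.h refuses to allocate when the target pointer is not already NULL, so a vector struct coming from uninitialized storage (such as the freshly MALLOC'ed V and Z arrays) must be zeroed with vector_*_init before its first vector_*_alloc. A minimal sketch of the failure mode and the fix (names from this series; the _ORDINARY choice is illustrative):

  vector_float v;                /* uninitialized storage: v.vector_buffer is garbage      */
  /* vector_float_alloc( &v, _ORDINARY, 1, l, no_threading );
     -> MALLOC sees a non-NULL pointer and reports
        "malloc of ... failed: pointer is not NULL"                                        */
  vector_float_init( &v );       /* sets v.vector_buffer = NULL                            */
  vector_float_alloc( &v, _ORDINARY, 1, l, no_threading );   /* now allocates cleanly     */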
From 6cb10298047ff9fe444bb678084c686c22c40d48 Mon Sep 17 00:00:00 2001 From: Marc Illa Date: Wed, 8 Aug 2018 09:57:15 +0300 Subject: [PATCH 19/31] Fixed vector malloc error --- src/linsolve.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/linsolve.c index 4cc1209..3b08843 100644 --- a/src/linsolve.c +++ b/src/linsolve.c @@ -118,11 +118,13 @@ void fgmres_MP_struct_alloc( int m, int n, const int vl_type, double tol, const vector_float_alloc( &(p->sp.w), vl_type, n_vl, l, no_threading ); // V for ( i=0; i<m+1; i++ ) { + vector_float_init( &(p->sp.V[i]) ); vector_float_alloc( &(p->sp.V[i]), vl_type, n_vl, l, no_threading ); } // Z if ( precond != NULL ) { for ( i=0; i<k; i++ ) { + vector_float_init( &(p->sp.Z[i]) ); vector_float_alloc( &(p->sp.Z[i]), vl_type, n_vl, l, no_threading ); } } From 0222e7a6fc3e4ab2cd6c8140401e3cfa4f77fe79 Mon Sep 17 00:00:00 2001 From: Marc Illa Date: Thu, 9 Aug 2018 17:19:33 +0300 Subject: [PATCH 20/31] Multiple rhs for method 0 (no vectorization) --- src/data_generic.c | 19 ++ src/data_generic.h | 2 +- src/dirac_generic.c | 414 ++++++++++++++++++++++++++++++++++++ src/dirac_generic.h | 4 +- src/init.c | 5 +- src/linsolve_generic.c | 325 ++++++++++++++++++++++++++-- src/linsolve_generic.h | 6 +- src/main.c | 6 +- src/main.h | 3 + src/main_post_def_generic.h | 6 + src/main_pre_def_generic.h | 1 + src/operator_generic.c | 6 +- src/schwarz_generic.c | 3 +- src/solver_analysis.c | 4 +- src/top_level.c | 87 ++++---- src/vector_generic.c | 58 ++++- src/vector_generic.h | 6 +- 17 files changed, 884 insertions(+), 71 deletions(-) diff --git a/src/data_generic.c index ba63e3c..c666644 100644 --- a/src/data_generic.c +++ b/src/data_generic.c @@ -54,3 +54,22 @@ void vector_PRECISION_define_random( vector_PRECISION *phi, int start, int end, if(thread == 0 && start != end) PROF_PRECISION_STOP( _SET, 1 ); } + + +void vector_PRECISION_define_random_new( vector_PRECISION *phi, level_struct *l, struct Thread *threading ) { + + int start, end; + compute_core_start_end( 0, (phi->size)*(phi->num_vect), &start, &end, l, threading ); + int thread = omp_get_thread_num(); + if(thread == 0) + PROF_PRECISION_START( _SET ); + if ( phi != NULL ) { + int i; + for ( i=start; i<end; i++ ) + phi->vector_buffer[i] = (PRECISION)(((double)rand()/(double)RAND_MAX))-0.5 + ( (PRECISION)((double)rand()/(double)RAND_MAX)-0.5)*_Complex_I; + } else { + error0("Error in \"vector_PRECISION_define_random\": pointer is null\n"); + } + if(thread == 0) + PROF_PRECISION_STOP( _SET, 1 ); +} diff --git a/src/data_generic.h index 9ac8a58..76fd875 100644 --- a/src/data_generic.h +++ b/src/data_generic.h @@ -24,5 +24,5 @@ void buffer_PRECISION_define( complex_PRECISION *phi, complex_PRECISION value, int start, int end, level_struct *l ); void vector_PRECISION_define_random( vector_PRECISION *phi, int start, int end, level_struct *l ); - + void vector_PRECISION_define_random_new( vector_PRECISION *phi, level_struct *l, struct Thread *threading ); #endif diff --git a/src/dirac_generic.c index 5259374..93536d1 100644 --- a/src/dirac_generic.c +++ b/src/dirac_generic.c @@ -166,6 +166,153 @@ void clover_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, operator_PR } + +void clover_PRECISION_new( vector_PRECISION *eta, vector_PRECISION *phi, int n_vec, operator_PRECISION_struct *op, int start, int end, + level_struct *l, struct Thread *threading ) { + + int nv = l->num_lattice_site_var; + int phi_shift = (phi->num_vect == 1)?0:phi->size*n_vec, eta_shift = (eta->num_vect == 1)?0:eta->size*n_vec; + buffer_PRECISION lphi = phi->vector_buffer+start+phi_shift, leta = eta->vector_buffer+start+eta_shift; + buffer_PRECISION leta_end = eta->vector_buffer+end+eta->size*n_vec; +#ifdef PROFILING + START_MASTER(threading) + PROF_PRECISION_START( _SC ); + END_MASTER(threading) +#endif + +#ifdef HAVE_TM + 
config_PRECISION tm_term = op->tm_term+(start/nv)*12; +#endif + + if ( g.csw == 0.0 ) { + + config_PRECISION clover = op->clover+(start/nv)*12; +#ifdef HAVE_TM1p1 + if( g.n_flavours == 2 ) { +#ifdef HAVE_TM + if ( g.mu + g.mu_odd_shift != 0.0 || g.mu + g.mu_even_shift != 0.0 ) + while ( leta < leta_end ) { + FOR6( *leta = (*lphi)*((*clover)+(*tm_term)); leta++; lphi++; clover++; tm_term++; ); + clover -= 6; + tm_term -= 6; + FOR6( *leta = (*lphi)*((*clover)-(*tm_term)); leta++; lphi++; clover++; tm_term++; ); + FOR6( *leta = (*lphi)*((*clover)+(*tm_term)); leta++; lphi++; clover++; tm_term++; ); + clover -= 6; + tm_term -= 6; + FOR6( *leta = (*lphi)*((*clover)-(*tm_term)); leta++; lphi++; clover++; tm_term++; ); + } + else +#endif + while ( leta < leta_end ) { + FOR6( *leta = (*lphi)*(*clover); leta++; lphi++; clover++; ); + clover -= 6; + FOR12( *leta = (*lphi)*(*clover); leta++; lphi++; clover++; ); + clover -= 6; + FOR6( *leta = (*lphi)*(*clover); leta++; lphi++; clover++; ); + } + } else { +#endif +#ifdef HAVE_TM + if ( g.mu + g.mu_odd_shift != 0.0 || g.mu + g.mu_even_shift != 0.0 ) { + while ( leta < leta_end ) + FOR12( *leta = (*lphi)*((*clover)+(*tm_term)); leta++; lphi++; clover++; tm_term++; ); + } else +#endif + while ( leta < leta_end ) + FOR12( *leta = (*lphi)*(*clover); leta++; lphi++; clover++; ); +#ifdef HAVE_TM1p1 + } +#endif + + } else { + +#ifndef OPTIMIZED_SELF_COUPLING_PRECISION + + config_PRECISION clover = op->clover+(start/nv)*42; +#ifdef HAVE_TM1p1 + if( g.n_flavours == 2 ) { +#ifdef HAVE_TM + if ( g.mu + g.mu_odd_shift != 0.0 || g.mu + g.mu_even_shift != 0.0 ) + while ( leta < leta_end ) { + doublet_site_clover_PRECISION( leta, lphi, clover ); + clover+=42; + FOR6( *leta += (*lphi)*(*tm_term); leta++; lphi++; tm_term++; ); + tm_term -= 6; + FOR6( *leta -=(*lphi)*(*tm_term); leta++; lphi++; tm_term++; ); + FOR6( *leta += (*lphi)*(*tm_term); leta++; lphi++; tm_term++; ); + tm_term -= 6; + FOR6( *leta -= (*lphi)*(*tm_term); leta++; lphi++; tm_term++; ); + } + else +#endif + while ( leta < leta_end ) { + doublet_site_clover_PRECISION( leta, lphi, clover ); + leta+=24; lphi+=24; + clover+=42; + } + } else { +#endif +#ifdef HAVE_TM + if ( g.mu + g.mu_odd_shift != 0.0 || g.mu + g.mu_even_shift != 0.0 ) + while ( leta < leta_end ) { + site_clover_PRECISION( leta, lphi, clover ); + FOR12( *leta += (*lphi)*(*tm_term); leta++; lphi++; tm_term++; ); + clover+=42; + } + else +#endif + while ( leta < leta_end ) { + site_clover_PRECISION( leta, lphi, clover ); + leta+=12; lphi+=12; + clover+=42; + } +#ifdef HAVE_TM1p1 + } +#endif + +#else + +#ifdef HAVE_TM1p1 + PRECISION *clover = ( g.n_flavours == 2 ) ? 
op->clover_doublet_vectorized : op->clover_vectorized; +#else + PRECISION *clover = op->clover_vectorized; +#endif + clover += start*12; + while ( leta < leta_end ) { // tm_term included in the clover vectorized + sse_site_clover_PRECISION( (PRECISION*)leta, (PRECISION*)lphi, clover ); + leta += nv; lphi += nv; + clover += 12*nv; + } + +#endif + + } + +#ifdef HAVE_TM1p1 + config_PRECISION eps_term = op->epsbar_term+(start/nv)*12; + lphi = phi->vector_buffer+start+phi_shift, leta = eta->vector_buffer+start+eta_shift; + if ( g.n_flavours == 2 && + ( g.epsbar != 0 || g.epsbar_ig5_odd_shift != 0 || g.epsbar_ig5_odd_shift != 0 ) ) + while ( leta < leta_end ) { + lphi += 6; + FOR6( *leta += (*lphi)*(*eps_term); leta++; lphi++; eps_term++; ) + lphi -= 12; + eps_term -= 6; + FOR6( *leta += (*lphi)*(*eps_term); leta++; lphi++; eps_term++; ) + lphi += 6; + } +#endif + + +#ifdef PROFILING + START_MASTER(threading) + PROF_PRECISION_STOP( _SC, 1 ); + END_MASTER(threading) +#endif + +} + + static void spin0and1_clover_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, config_PRECISION clover, level_struct *l ) { buffer_PRECISION eta_end = eta->vector_buffer + l->inner_vector_size, leta = eta->vector_buffer, lphi = phi->vector_buffer; @@ -618,6 +765,273 @@ void d_plus_clover_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, oper } +void d_plus_clover_PRECISION_new( vector_PRECISION *eta, vector_PRECISION *phi, int n_vec, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { + + int n = l->num_inner_lattice_sites, *neighbor = op->neighbor_table, start, end, nv = l->num_lattice_site_var; +#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION + complex_PRECISION *prn[4] = { op->prnT, op->prnZ, op->prnY, op->prnX }; + complex_PRECISION *prp[4] = { op->prpT, op->prpZ, op->prpY, op->prpX }; +#else + int i, j, *nb_pt; + buffer_PRECISION phi_pt, eta_pt, end_pt; + config_PRECISION D_pt; + int phi_shift = (phi->num_vect == 1)?0:phi->size*n_vec, eta_shift = (eta->num_vect == 1)?0:eta->size*n_vec; +#endif + + compute_core_start_end(0, nv*n, &start, &end, l, threading ); + + SYNC_MASTER_TO_ALL(threading) + clover_PRECISION_new(eta, phi, n_vec, op, start, end, l, threading ); + START_MASTER(threading) + PROF_PRECISION_START( _NC ); + END_MASTER(threading) + +#ifdef HAVE_TM1p1 + if( g.n_flavours == 2 ) { +#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION + dprp_PRECISION( prn, phi->vector_buffer, start, end ); +#else + complex_PRECISION pbuf[12]; + for ( i=start/2, phi_pt=phi->vector_buffer+start+phi_shift; i<end/2; i+=12, phi_pt+=24 ) { + dprp_T_PRECISION( op->prnT+i, phi_pt ); + dprp_Z_PRECISION( op->prnZ+i, phi_pt ); + dprp_Y_PRECISION( op->prnY+i, phi_pt ); + dprp_X_PRECISION( op->prnX+i, phi_pt ); + } +#endif + // start communication in negative direction + START_LOCKED_MASTER(threading) + ghost_sendrecv_PRECISION( op->prnT, T, -1, &(op->c), _FULL_SYSTEM, l ); + ghost_sendrecv_PRECISION( op->prnZ, Z, -1, &(op->c), _FULL_SYSTEM, l ); + ghost_sendrecv_PRECISION( op->prnY, Y, -1, &(op->c), _FULL_SYSTEM, l ); + ghost_sendrecv_PRECISION( op->prnX, X, -1, &(op->c), _FULL_SYSTEM, l ); + END_LOCKED_MASTER(threading) +#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION + dprn_su3_PRECISION( prp, phi->vector_buffer, op, neighbor, start, end ); +#else + // project plus dir and multiply with U dagger + for ( phi_pt=phi->vector_buffer+start+phi_shift, end_pt=phi->vector_buffer+end+phi_shift, D_pt = op->D+((start/nv)*36), nb_pt=neighbor+((start/nv)*4); phi_pt<end_pt; phi_pt+=nv ) { + // T dir + j = nv/2*(*nb_pt); nb_pt++; + dprn_T_PRECISION( pbuf, phi_pt ); + mvmh_PRECISION( op->prpT+j, D_pt, pbuf ); + mvmh_PRECISION( op->prpT+j+3, D_pt, pbuf+3 ); + mvmh_PRECISION( op->prpT+j+6, D_pt, 
pbuf+6 ); + mvmh_PRECISION( op->prpT+j+9, D_pt, pbuf+9 ); D_pt += 9; + // Z dir + j = nv/2*(*nb_pt); nb_pt++; + dprn_Z_PRECISION( pbuf, phi_pt ); + mvmh_PRECISION( op->prpZ+j, D_pt, pbuf ); + mvmh_PRECISION( op->prpZ+j+3, D_pt, pbuf+3 ); + mvmh_PRECISION( op->prpZ+j+6, D_pt, pbuf+6 ); + mvmh_PRECISION( op->prpZ+j+9, D_pt, pbuf+9 ); D_pt += 9; + // Y dir + j = nv/2*(*nb_pt); nb_pt++; + dprn_Y_PRECISION( pbuf, phi_pt ); + mvmh_PRECISION( op->prpY+j, D_pt, pbuf ); + mvmh_PRECISION( op->prpY+j+3, D_pt, pbuf+3 ); + mvmh_PRECISION( op->prpY+j+6, D_pt, pbuf+6 ); + mvmh_PRECISION( op->prpY+j+9, D_pt, pbuf+9 ); D_pt += 9; + // X dir + j = nv/2*(*nb_pt); nb_pt++; + dprn_X_PRECISION( pbuf, phi_pt ); + mvmh_PRECISION( op->prpX+j, D_pt, pbuf ); + mvmh_PRECISION( op->prpX+j+3, D_pt, pbuf+3 ); + mvmh_PRECISION( op->prpX+j+6, D_pt, pbuf+6 ); + mvmh_PRECISION( op->prpX+j+9, D_pt, pbuf+9 ); D_pt += 9; + } +#endif + // start communication in positive direction + START_LOCKED_MASTER(threading) + ghost_sendrecv_PRECISION( op->prpT, T, +1, &(op->c), _FULL_SYSTEM, l ); + ghost_sendrecv_PRECISION( op->prpZ, Z, +1, &(op->c), _FULL_SYSTEM, l ); + ghost_sendrecv_PRECISION( op->prpY, Y, +1, &(op->c), _FULL_SYSTEM, l ); + ghost_sendrecv_PRECISION( op->prpX, X, +1, &(op->c), _FULL_SYSTEM, l ); + // wait for communication in negative direction + ghost_wait_PRECISION( op->prnT, T, -1, &(op->c), _FULL_SYSTEM, l ); + ghost_wait_PRECISION( op->prnZ, Z, -1, &(op->c), _FULL_SYSTEM, l ); + ghost_wait_PRECISION( op->prnY, Y, -1, &(op->c), _FULL_SYSTEM, l ); + ghost_wait_PRECISION( op->prnX, X, -1, &(op->c), _FULL_SYSTEM, l ); + END_LOCKED_MASTER(threading) + +#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION + su3_dpbp_PRECISION( eta->vector_buffer, prn, op, neighbor, start, end ); +#else + // multiply with U and lift up minus dir + for ( eta_pt=eta->vector_buffer+start+eta_shift, end_pt=eta->vector_buffer+end+eta_shift, D_pt = op->D+(start/nv)*36, nb_pt=neighbor+(start/nv)*4; eta_pt<end_pt; eta_pt+=nv ) { + // T dir + j = nv/2*(*nb_pt); nb_pt++; + mvm_PRECISION( pbuf, D_pt, op->prnT+j ); + mvm_PRECISION( pbuf+3, D_pt, op->prnT+j+3 ); + mvm_PRECISION( pbuf+6, D_pt, op->prnT+j+6 ); + mvm_PRECISION( pbuf+9, D_pt, op->prnT+j+9 ); + dpbp_su3_T_PRECISION( pbuf, eta_pt ); D_pt += 9; + // Z dir + j = nv/2*(*nb_pt); nb_pt++; + mvm_PRECISION( pbuf, D_pt, op->prnZ+j ); + mvm_PRECISION( pbuf+3, D_pt, op->prnZ+j+3 ); + mvm_PRECISION( pbuf+6, D_pt, op->prnZ+j+6 ); + mvm_PRECISION( pbuf+9, D_pt, op->prnZ+j+9 ); + dpbp_su3_Z_PRECISION( pbuf, eta_pt ); D_pt += 9; + // Y dir + j = nv/2*(*nb_pt); nb_pt++; + mvm_PRECISION( pbuf, D_pt, op->prnY+j ); + mvm_PRECISION( pbuf+3, D_pt, op->prnY+j+3 ); + mvm_PRECISION( pbuf+6, D_pt, op->prnY+j+6 ); + mvm_PRECISION( pbuf+9, D_pt, op->prnY+j+9 ); + dpbp_su3_Y_PRECISION( pbuf, eta_pt ); D_pt += 9; + // X dir + j = nv/2*(*nb_pt); nb_pt++; + mvm_PRECISION( pbuf, D_pt, op->prnX+j ); + mvm_PRECISION( pbuf+3, D_pt, op->prnX+j+3 ); + mvm_PRECISION( pbuf+6, D_pt, op->prnX+j+6 ); + mvm_PRECISION( pbuf+9, D_pt, op->prnX+j+9 ); + dpbp_su3_X_PRECISION( pbuf, eta_pt ); D_pt += 9; + } +#endif + + // wait for communication in positive direction + START_LOCKED_MASTER(threading) + ghost_wait_PRECISION( op->prpT, T, +1, &(op->c), _FULL_SYSTEM, l ); + ghost_wait_PRECISION( op->prpZ, Z, +1, &(op->c), _FULL_SYSTEM, l ); + ghost_wait_PRECISION( op->prpY, Y, +1, &(op->c), _FULL_SYSTEM, l ); + ghost_wait_PRECISION( op->prpX, X, +1, &(op->c), _FULL_SYSTEM, l ); + END_LOCKED_MASTER(threading) + + // lift up plus dir +#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION + dpbn_PRECISION( eta->vector_buffer, prp, start, end ); +#else + for ( 
i=start/2, eta_pt=eta->vector_buffer+start+eta_shift; i<end/2; i+=12, eta_pt+=24 ) { + dpbn_su3_T_PRECISION( op->prpT+i, eta_pt ); + dpbn_su3_Z_PRECISION( op->prpZ+i, eta_pt ); + dpbn_su3_Y_PRECISION( op->prpY+i, eta_pt ); + dpbn_su3_X_PRECISION( op->prpX+i, eta_pt ); + } +#endif + } else { +#endif + +#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION + prp_PRECISION( prn, phi->vector_buffer, start, end ); +#else + complex_PRECISION pbuf[6]; + for ( i=start/2, phi_pt=phi->vector_buffer+start+phi_shift; i<end/2; i+=6, phi_pt+=12 ) { + prp_T_PRECISION( op->prnT+i, phi_pt ); + prp_Z_PRECISION( op->prnZ+i, phi_pt ); + prp_Y_PRECISION( op->prnY+i, phi_pt ); + prp_X_PRECISION( op->prnX+i, phi_pt ); + } +#endif + // start communication in negative direction + START_LOCKED_MASTER(threading) + ghost_sendrecv_PRECISION( op->prnT, T, -1, &(op->c), _FULL_SYSTEM, l ); + ghost_sendrecv_PRECISION( op->prnZ, Z, -1, &(op->c), _FULL_SYSTEM, l ); + ghost_sendrecv_PRECISION( op->prnY, Y, -1, &(op->c), _FULL_SYSTEM, l ); + ghost_sendrecv_PRECISION( op->prnX, X, -1, &(op->c), _FULL_SYSTEM, l ); + END_LOCKED_MASTER(threading) + + // project plus dir and multiply with U dagger +#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION + prn_su3_PRECISION( prp, phi->vector_buffer, op, neighbor, start, end ); +#else + for ( phi_pt=phi->vector_buffer+start+phi_shift, end_pt=phi->vector_buffer+end+phi_shift, D_pt = op->D+(start*3), nb_pt=neighbor+((start/12)*4); phi_pt<end_pt; phi_pt+=12 ) { + // T dir + j = 6*(*nb_pt); nb_pt++; + prn_T_PRECISION( pbuf, phi_pt ); + mvmh_PRECISION( op->prpT+j, D_pt, pbuf ); + mvmh_PRECISION( op->prpT+j+3, D_pt, pbuf+3 ); D_pt += 9; + // Z dir + j = 6*(*nb_pt); nb_pt++; + prn_Z_PRECISION( pbuf, phi_pt ); + mvmh_PRECISION( op->prpZ+j, D_pt, pbuf ); + mvmh_PRECISION( op->prpZ+j+3, D_pt, pbuf+3 ); D_pt += 9; + // Y dir + j = 6*(*nb_pt); nb_pt++; + prn_Y_PRECISION( pbuf, phi_pt ); + mvmh_PRECISION( op->prpY+j, D_pt, pbuf ); + mvmh_PRECISION( op->prpY+j+3, D_pt, pbuf+3 ); D_pt += 9; + // X dir + j = 6*(*nb_pt); nb_pt++; + prn_X_PRECISION( pbuf, phi_pt ); + mvmh_PRECISION( op->prpX+j, D_pt, pbuf ); + mvmh_PRECISION( op->prpX+j+3, D_pt, pbuf+3 ); D_pt += 9; + } +#endif + + // start communication in positive direction + START_LOCKED_MASTER(threading) + ghost_sendrecv_PRECISION( op->prpT, T, +1, &(op->c), _FULL_SYSTEM, l ); + ghost_sendrecv_PRECISION( op->prpZ, Z, +1, &(op->c), _FULL_SYSTEM, l ); + ghost_sendrecv_PRECISION( op->prpY, Y, +1, &(op->c), _FULL_SYSTEM, l ); + ghost_sendrecv_PRECISION( op->prpX, X, +1, &(op->c), _FULL_SYSTEM, l ); + // wait for communication in negative direction + ghost_wait_PRECISION( op->prnT, T, -1, &(op->c), _FULL_SYSTEM, l ); + ghost_wait_PRECISION( op->prnZ, Z, -1, &(op->c), _FULL_SYSTEM, l ); + ghost_wait_PRECISION( op->prnY, Y, -1, &(op->c), _FULL_SYSTEM, l ); + ghost_wait_PRECISION( op->prnX, X, -1, &(op->c), _FULL_SYSTEM, l ); + END_LOCKED_MASTER(threading) + + // multiply with U and lift up minus dir +#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION + su3_pbp_PRECISION( eta->vector_buffer, prn, op, neighbor, start, end ); +#else + for ( eta_pt=eta->vector_buffer+start+eta_shift, end_pt=eta->vector_buffer+end+eta_shift, D_pt = op->D+start*3, nb_pt=neighbor+(start/12)*4; eta_pt<end_pt; eta_pt+=12 ) { + // T dir + j = 6*(*nb_pt); nb_pt++; + mvm_PRECISION( pbuf, D_pt, op->prnT+j ); + mvm_PRECISION( pbuf+3, D_pt, op->prnT+j+3 ); + pbp_su3_T_PRECISION( pbuf, eta_pt ); D_pt += 9; + // Z dir + j = 6*(*nb_pt); nb_pt++; + mvm_PRECISION( pbuf, D_pt, op->prnZ+j ); + mvm_PRECISION( pbuf+3, D_pt, op->prnZ+j+3 ); + pbp_su3_Z_PRECISION( pbuf, eta_pt ); D_pt += 9; + // Y dir + j = 6*(*nb_pt); nb_pt++; + mvm_PRECISION( pbuf, D_pt, op->prnY+j ); + mvm_PRECISION( pbuf+3, D_pt, op->prnY+j+3 ); + pbp_su3_Y_PRECISION( pbuf, eta_pt ); D_pt += 9; + // X dir + j = 6*(*nb_pt); nb_pt++; + mvm_PRECISION( pbuf, D_pt, 
op->prnX+j ); + mvm_PRECISION( pbuf+3, D_pt, op->prnX+j+3 ); + pbp_su3_X_PRECISION( pbuf, eta_pt ); D_pt += 9; + } +#endif + + // wait for communication in positive direction + START_LOCKED_MASTER(threading) + ghost_wait_PRECISION( op->prpT, T, +1, &(op->c), _FULL_SYSTEM, l ); + ghost_wait_PRECISION( op->prpZ, Z, +1, &(op->c), _FULL_SYSTEM, l ); + ghost_wait_PRECISION( op->prpY, Y, +1, &(op->c), _FULL_SYSTEM, l ); + ghost_wait_PRECISION( op->prpX, X, +1, &(op->c), _FULL_SYSTEM, l ); + END_LOCKED_MASTER(threading) + + // lift up plus dir +#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION + pbn_PRECISION( eta->vector_buffer, prp, start, end ); +#else + for ( i=start/2, eta_pt=eta->vector_buffer+start+eta_shift; i<end/2; i+=6, eta_pt+=12 ) { + pbn_su3_T_PRECISION( op->prpT+i, eta_pt ); + pbn_su3_Z_PRECISION( op->prpZ+i, eta_pt ); + pbn_su3_Y_PRECISION( op->prpY+i, eta_pt ); + pbn_su3_X_PRECISION( op->prpX+i, eta_pt ); + } +#endif +#ifdef HAVE_TM1p1 + } +#endif + + START_MASTER(threading) + PROF_PRECISION_STOP( _NC, 1 ); + END_MASTER(threading) + + SYNC_MASTER_TO_ALL(threading) +} + + + void gamma5_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, level_struct *l, struct Thread *threading ) { ASSERT(l->depth == 0); diff --git a/src/dirac_generic.h index 672c718..fdcb4b3 100644 --- a/src/dirac_generic.h +++ b/src/dirac_generic.h @@ -29,8 +29,10 @@ void clover_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, operator_PRECISION_struct *op, int start, int end, level_struct *l, struct Thread *threading ); - + void clover_PRECISION_new( vector_PRECISION *eta, vector_PRECISION *phi, int n_vec, operator_PRECISION_struct *op, int start, int end, level_struct *l, struct Thread *threading ); + void d_plus_clover_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); + void d_plus_clover_PRECISION_new( vector_PRECISION *eta, vector_PRECISION *phi, int n_vec, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); void d_plus_clover_dagger_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); void g5D_plus_clover_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); void block_d_plus_clover_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, int start, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ); diff --git a/src/init.c index ad29bbc..31426b4 100644 --- a/src/init.c +++ b/src/init.c @@ -193,8 +193,11 @@ void method_setup( vector_double *V, level_struct *l, struct Thread *threading ) #ifdef INIT_ONE_PREC } else { #endif - fgmres_double_struct_alloc( g.restart, g.max_restart, _INNER, g.tol, + /*fgmres_double_struct_alloc( g.restart, g.max_restart, _INNER, g.tol, _GLOBAL_FGMRES, _NOTHING, NULL, d_plus_clover_double, + &(g.p), l );*/ + fgmres_double_struct_alloc( g.restart, g.max_restart, _INNER, g.tol, + _GLOBAL_FGMRES, _NOTHING, NULL, d_plus_clover_double_new, &(g.p), l ); #ifdef INIT_ONE_PREC }
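With d_plus_clover_double_new registered in method_setup, the solver applies the Dirac operator to one right-hand side of a packed multi-vector at a time: the extra n_vec argument selects the slice, and the *_new kernels offset their buffer pointers by n_vec*size (the convention visible in clover_PRECISION_new above). A minimal sketch of that addressing, with a hypothetical helper name:

  /* Packed storage: right-hand side k occupies entries [ k*size, (k+1)*size ).  */
  static inline buffer_double slice_double( vector_double *v, int k ) {
    return v->vector_buffer + ((v->num_vect == 1) ? 0 : (long)k * v->size);
  }
  /* Applying D to every stored right-hand side then loops over slices,
     mirroring apply_operator_PRECISION_new in main_post_def_generic.h:          */
  for ( int k=0; k<g.num_rhs_vect; k++ )
    apply_operator_double_new( &(g.p.w), &(g.p.x), k, &(g.p), l, threading );  /* w_k = D x_k */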
diff --git a/src/linsolve_generic.c index 39fa64c..9e3e036 100644 --- a/src/linsolve_generic.c +++ b/src/linsolve_generic.c @@ -62,7 +62,7 @@ void fgmres_PRECISION_struct_alloc( int m, int n, const int vl_type, PRECISION t *********************************************************************************/ long int total=0; - int i, k=0, n_vl=1; + int i, k=0, n_vl=g.num_rhs_vect;//, n_vl2=1; p->restart_length = m; p->num_restart = n; @@ -72,7 +72,8 @@ void fgmres_PRECISION_struct_alloc( int m, int n, const int vl_type, PRECISION t p->kind = prec_kind; #ifdef HAVE_TM1p1 - n_vl=2; + n_vl*=2; + //n_vl2=2; #endif if(m > 0) { @@ -229,7 +230,8 @@ int fgmres_PRECISION( gmres_PRECISION_struct *p, level_struct *l, struct Thread int start; int end; - int j=-1, finish=0, iter=0, il, ol, res; + //int j=-1, finish=0, iter=0, il, ol, res, n_vec=0; + int iter=0, il, ol, res, n_vec=0; complex_PRECISION gamma0 = 0; complex_PRECISION beta = 0; @@ -249,13 +251,18 @@ int fgmres_PRECISION( gmres_PRECISION_struct *p, level_struct *l, struct Thread SYNC_MASTER_TO_ALL(threading) // compute start and end indices for core // this puts zero for all other hyperthreads, so we can call functions below with all hyperthreads - compute_core_start_end(p->v_start, p->v_end, &start, &end, l, threading); + for( n_vec=0; n_vec<g.num_rhs_vect; n_vec++ ) { + int j=-1, finish=0; + compute_core_start_end(p->v_start+p->r.size*n_vec, p->v_end+p->r.size*n_vec, &start, &end, l, threading); + printf0("n_vec=%d\n", n_vec); + for( ol=0; ol<p->num_restart && finish==0; ol++ ) { - if( ol == 0 && p->initial_guess_zero ) { res = _NO_RES; vector_PRECISION_copy( &(p->r), &(p->b), start, end, l ); + //vector_PRECISION_copy_new( &(p->r), &(p->b), l, threading ); } else { res = _RES; if ( p->kind == _LEFT && p->preconditioner ) { @@ -267,11 +274,13 @@ int fgmres_PRECISION( gmres_PRECISION_struct *p, level_struct *l, struct Thread } p->preconditioner( &(p->w), NULL, &(p->Z[0]), _NO_RES, l, threading ); } else { - apply_operator_PRECISION( &(p->w), &(p->x), p, l, threading ); // compute w = D*x + //apply_operator_PRECISION( &(p->w), &(p->x), p, l, threading ); // compute w = D*x + apply_operator_PRECISION_new( &(p->w), &(p->x), n_vec, p, l, threading ); } vector_PRECISION_minus( &(p->r), &(p->b), &(p->w), start, end, l ); // compute r = b - w } - gamma0 = global_norm_PRECISION( &(p->r), p->v_start, p->v_end, l, threading ); // gamma_0 = norm(r) + //gamma0 = global_norm_PRECISION( &(p->r), p->v_start, p->v_end, l, threading ); // gamma_0 = norm(r) + gamma0 = global_norm_PRECISION( &(p->r), p->v_start+p->r.size*n_vec, p->v_end+p->r.size*n_vec, l, threading ); START_MASTER(threading) p->gamma[0] = gamma0; END_MASTER(threading); @@ -315,7 +324,7 @@ int fgmres_PRECISION( gmres_PRECISION_struct *p, level_struct *l, struct Thread } } #else - if ( !arnoldi_step_PRECISION( p->V, p->Z, &(p->w), p->H, p->y, j, p->preconditioner, p, l, threading ) ) { + if ( !arnoldi_step_PRECISION_new( p->V, p->Z, &(p->w), p->H, p->y, j, n_vec, p->preconditioner, p, l, threading ) ) { printf0("| -------------- iteration %d, restart due to H(%d,%d) < 0 |\n", iter, j+1, j ); break; } @@ -345,8 +354,10 @@ int fgmres_PRECISION( gmres_PRECISION_struct *p, level_struct *l, struct Thread break; } } // end of a single restart - compute_solution_PRECISION( &(p->x), (p->preconditioner&&p->kind==_RIGHT)?(p->Z):(p->V), - p->y, p->gamma, p->H, j, (res==_NO_RES)?ol:1, p, l, threading ); + /*compute_solution_PRECISION( &(p->x), (p->preconditioner&&p->kind==_RIGHT)?(p->Z):(p->V), + p->y, p->gamma, p->H, j, (res==_NO_RES)?ol:1, p, l, threading );*/ + compute_solution_PRECISION_new( &(p->x), (p->preconditioner&&p->kind==_RIGHT)?(p->Z):(p->V), + p->y, p->gamma, p->H, j, (res==_NO_RES)?ol:1, n_vec, p, l, threading ); } // end of fgmres START_LOCKED_MASTER(threading) @@ -355,9 +366,10 @@ int fgmres_PRECISION( gmres_PRECISION_struct *p, level_struct *l, struct Thread if ( p->print ) { #ifdef FGMRES_RESTEST - apply_operator_PRECISION( &(p->w), &(p->x), p, l, threading ); + 
//apply_operator_PRECISION( &(p->w), &(p->x), p, l, threading ); + apply_operator_PRECISION_new( &(p->w), &(p->x), n_vec, p, l, threading ); vector_PRECISION_minus( &(p->r), &(p->b), &(p->w), start, end, l ); - beta = global_norm_PRECISION( &(p->r), p->v_start, p->v_end, l, threading ); + beta = global_norm_PRECISION( &(p->r), p->v_start+p->r.size*n_vec, p->v_end+p->r.size*n_vec, l, threading ); #else beta = gamma_jp1; #endif @@ -370,6 +382,7 @@ int fgmres_PRECISION( gmres_PRECISION_struct *p, level_struct *l, struct Thread printf0("| FGMRES iterations: %-6d coarse average: %-6.2lf |\n", iter, ((double)g.coarse_iter_count)/((double)iter) ); printf0("| exact relative residual: ||r||/||b|| = %e |\n", creal(beta)/norm_r0 ); + printf0("| solution for the vector : n_vec = %d |\n", n_vec+1 ); printf0("| elapsed wall clock time: %-8.4lf seconds |\n", t1-t0 ); if ( g.coarse_time > 0 ) printf0("| coarse grid time: %-8.4lf seconds (%04.1lf%%) |\n", @@ -414,7 +427,7 @@ int fgmres_PRECISION( gmres_PRECISION_struct *p, level_struct *l, struct Thread if ( g.method != 6 ) prof_print( l ); END_MASTER(threading) } - + } return iter; } @@ -898,6 +911,246 @@ int arnoldi_step_PRECISION( vector_PRECISION *V, vector_PRECISION *Z, vector_PRE } + + +int arnoldi_step_PRECISION_new( vector_PRECISION *V, vector_PRECISION *Z, vector_PRECISION *w, + complex_PRECISION **H, complex_PRECISION* buffer, int j, int n_vec, void (*prec)(), + gmres_PRECISION_struct *p, level_struct *l, struct Thread *threading ) { + +/********************************************************************************* +* Extends the Arnoldi basis by one vector. +* - vector_PRECISION **V: Contains the Arnoldi basis vectors. +* - vector_PRECISION **Z: If a right precond. P is used, contains P*V[j] for all j. +* - vector_PRECISION *w: Will be appended to existing Arnoldi basis at +* position j+1. +* - complex_PRECISION **H: Contains full Hessenberg matrix from the Arnoldi +* decomposition (columnmajor!) +* - complex_PRECISION* buffer: Buffer for local inner products. +* - int j: index of the new Arnoldi vector to be orthonormalized +* against all previous ones. +* - void (*prec)(): Function pointer to preconditioner (can be NULL if no +* preconditioning is used). 
+*********************************************************************************/ +#ifdef SINGLE_ALLREDUCE_ARNOLDI +#ifdef PIPELINED_ARNOLDI + if ( l->level == 0 && l->depth > 0 ) { + SYNC_MASTER_TO_ALL(threading) + SYNC_CORES(threading) + MPI_Request req; + MPI_Status stat; + int start, end, i; + const complex_PRECISION sigma = 0; + compute_core_start_end(p->v_start, p->v_end, &start, &end, l, threading); + + if ( j == 0 ) + vector_PRECISION_copy( &Z[0], &V[0], start, end, l ); + else + vector_PRECISION_copy( &V[j], &Z[j], start, end, l ); + + complex_PRECISION tmp[j+1]; + process_multi_inner_product_PRECISION( j+1, tmp, V, &V[j], p->v_start, p->v_end, l, threading ); + START_MASTER(threading) + PROF_PRECISION_START( _ALLR ); + for( i=0; i<=j; i++ ) + buffer[i] = tmp[i]; + if ( g.num_processes > 1 ) { + MPI_Iallreduce( buffer, H[MAX(0,j-1)], j+1, MPI_COMPLEX_PRECISION, MPI_SUM, + (l->depth==0)?g.comm_cart:l->gs_PRECISION.level_comm, &req ); + } else { + for( i=0; i<=j; i++ ) + H[MAX(0,j-1)][i] = buffer[i]; + } + PROF_PRECISION_STOP( _ALLR, 1 ); + END_MASTER(threading) + + apply_operator_PRECISION( &Z[j+1], &Z[j], p, l, threading ); + + START_MASTER(threading) + PROF_PRECISION_START( _ALLR ); + if ( g.num_processes > 1 ) { + MPI_Wait( &req, &stat ); + } + PROF_PRECISION_STOP( _ALLR, 0 ); + if ( j > 0 ) { + for ( i=0; i 0 ) { + H[j-1][j-1] += sigma; + } + END_MASTER(threading) + SYNC_MASTER_TO_ALL(threading) + + if ( j == 0 ) { + if ( sigma ) vector_PRECISION_saxpy( &Z[j+1], &Z[j+1], &Z[j], -sigma, start, end, l ); + } else { + for( i=0; iv_start, p->v_end, &start, &end, l, threading); + + if ( prec != NULL ) { + if ( p->kind == _LEFT ) { + apply_operator_PRECISION( &Z[0], &V[j], p, l, threading ); + prec( &V[j+1], NULL, &Z[0], _NO_RES, l, threading ); + if ( sigma ) vector_PRECISION_saxpy( &V[j+1], &V[j+1], &V[j], -sigma, start, end, l ); + } else { + if ( l->level == 0 ) { + prec( &Z[j], NULL, &V[j], _NO_RES, l, threading ); + apply_operator_PRECISION( &V[j+1], &Z[j], p, l, threading ); + } else { + if ( g.mixed_precision == 2 && (g.method >= 1 && g.method <= 2 ) ) { + prec( &Z[j], &V[j+1], &V[j], _NO_RES, l, threading ); + // obtains w = D * Z[j] from Schwarz + } else { + prec( &Z[j], NULL, &V[j], _NO_RES, l, threading ); + apply_operator_PRECISION( &V[j+1], &Z[j], p, l, threading ); // w = D*Z[j] + } + } + if ( sigma ) vector_PRECISION_saxpy( &V[j+1], &V[j+1], &V[j], -sigma, start, end, l ); + + } + } else { + apply_operator_PRECISION( &V[j+1], &V[j], p, l, threading ); // w = D*V[j] + if ( sigma ) vector_PRECISION_saxpy( &V[j+1], &V[j+1], &V[j], -sigma, start, end, l ); + } + + complex_PRECISION tmp[j+2]; + process_multi_inner_product_PRECISION( j+2, tmp, V, &V[j+1], p->v_start, p->v_end, l, threading ); + START_MASTER(threading) + for( i=0; i<=j+1; i++ ) + buffer[i] = tmp[i]; + + if ( g.num_processes > 1 ) { + PROF_PRECISION_START( _ALLR ); + MPI_Allreduce( buffer, H[j], j+2, MPI_COMPLEX_PRECISION, MPI_SUM, (l->depth==0)?g.comm_cart:l->gs_PRECISION.level_comm ); + PROF_PRECISION_STOP( _ALLR, 1 ); + } else { + for( i=0; i<=j+1; i++ ) + H[j][i] = buffer[i]; + } + for ( i=0; i<=j; i++ ) + H[j][j+1] -= conj( H[j][i] )*H[j][i]; + END_MASTER(threading) + SYNC_MASTER_TO_ALL(threading) + if ( creal( H[j][j+1] ) < 0 ) + return 0; + START_MASTER(threading) + H[j][j+1] = sqrt( creal( H[j][j+1] ) ); + END_MASTER(threading) + SYNC_MASTER_TO_ALL(threading) + + for( i=0; i<=j; i++ ) + vector_PRECISION_saxpy( &V[j+1], &V[j+1], &V[i], -H[j][i], start, end, l ); + vector_PRECISION_real_scale( 
&V[j+1], &V[j+1], 1/H[j][j+1], start, end, l ); + START_LOCKED_MASTER(threading) + H[j][j] += sigma; + END_LOCKED_MASTER(threading) +#ifdef PIPELINED_ARNOLDI + } +#endif +#else + SYNC_MASTER_TO_ALL(threading) + SYNC_CORES(threading) + int i; + // start and end indices for vector functions depending on thread + int start, end; + // compute start and end indices for core + // this puts zero for all other hyperthreads, so we can call functions below with all hyperthreads + compute_core_start_end(p->v_start+p->w.size*n_vec, p->v_end+p->w.size*n_vec, &start, &end, l, threading); + + if ( prec != NULL ) { + if ( p->kind == _LEFT ) { + apply_operator_PRECISION( &Z[0], &V[j], p, l, threading ); + prec( w, NULL, &Z[0], _NO_RES, l, threading ); + } else { + if ( l->level == 0 ) { + apply_operator_PRECISION( w, &Z[j], p, l, threading ); + } else { + if ( g.mixed_precision == 2 && (g.method >= 1 && g.method <= 2 ) ) { + prec( &Z[j], w, &V[j], _NO_RES, l, threading ); + // obtains w = D * Z[j] from Schwarz + } else { + prec( &Z[j], NULL, &V[j], _NO_RES, l, threading ); + apply_operator_PRECISION( w, &Z[j], p, l, threading ); // w = D*Z[j] + } + } + } + } else { + apply_operator_PRECISION_new( w, &V[j], n_vec, p, l, threading ); // w = D*V[j] + } + + // orthogonalization + complex_PRECISION tmp[j+1]; + process_multi_inner_product_PRECISION( j+1, tmp, V, w, p->v_start+p->w.size*n_vec, p->v_end+p->w.size*n_vec, l, threading ); + START_MASTER(threading) + for( i=0; i<=j; i++ ) + buffer[i] = tmp[i]; + if ( g.num_processes > 1 ) { + PROF_PRECISION_START( _ALLR ); + MPI_Allreduce( buffer, H[j], j+1, MPI_COMPLEX_PRECISION, MPI_SUM, (l->depth==0)?g.comm_cart:l->gs_PRECISION.level_comm ); + PROF_PRECISION_STOP( _ALLR, 1 ); + } else { + for( i=0; i<=j; i++ ) + H[j][i] = buffer[i]; + } + END_MASTER(threading) + SYNC_MASTER_TO_ALL(threading) + for( i=0; i<=j; i++ ) + vector_PRECISION_saxpy( w, w, &V[i], -H[j][i], start, end, l ); +#ifdef REORTH + // re-orthogonalization + process_multi_inner_product_PRECISION( j+1, tmp, V, w, p->v_start+p->w.size*n_vec, p->v_end+p->w.size*n_vec, l, threading ); + START_MASTER(threading) + for( i=0; i<=j; i++ ) + buffer[i] = tmp[i]; + if ( g.num_processes > 1 ) { + PROF_PRECISION_START( _ALLR ); + MPI_Allreduce( buffer, tmp, j+1, MPI_COMPLEX_PRECISION, MPI_SUM, (l->depth==0)?g.comm_cart:l->gs_PRECISION.level_comm ); + PROF_PRECISION_STOP( _ALLR, 1 ); + } + + for( i=0; i<=j; i++ ) + H[j][i] += tmp[i]; + + END_MASTER(threading) + SYNC_MASTER_TO_ALL(threading) + for( i=0; i<=j; i++ ) + vector_PRECISION_saxpy( w, w, &V[i], -tmp[i], start, end, l ); +#endif + + // normalization + PRECISION tmp2 = global_norm_PRECISION( w, p->v_start+p->w.size*n_vec, p->v_end+p->w.size*n_vec, l, threading ); + START_MASTER(threading) + H[j][j+1] = tmp2; + END_MASTER(threading) + SYNC_MASTER_TO_ALL(threading) + + // V_j+1 = w / H_j+1,j + if ( cabs_PRECISION( H[j][j+1] ) > 1e-15 ) + vector_PRECISION_real_scale( &V[j+1], w, 1/H[j][j+1], start, end, l ); +#endif + return 1; +} + + void qr_update_PRECISION( complex_PRECISION **H, complex_PRECISION *s, complex_PRECISION *c, complex_PRECISION *gamma, int j, level_struct *l, struct Thread *threading ) { @@ -987,6 +1240,52 @@ void compute_solution_PRECISION( vector_PRECISION *x, vector_PRECISION *V, compl } +void compute_solution_PRECISION_new( vector_PRECISION *x, vector_PRECISION *V, complex_PRECISION *y, + complex_PRECISION *gamma, complex_PRECISION **H, int j, int ol, int n_vec, + gmres_PRECISION_struct *p, level_struct *l, struct Thread *threading ) { + + int 
i, k; + // start and end indices for vector functions depending on thread + int start; + int end; + // compute start and end indices for core + // this puts zero for all other hyperthreads, so we can call functions below with all hyperthreads + compute_core_start_end(p->v_start+x->size*n_vec, p->v_end+x->size*n_vec, &start, &end, l, threading); + + START_MASTER(threading) + + PROF_PRECISION_START( _SMALL2 ); + + // backward substitution + for ( i=j; i>=0; i-- ) { + y[i] = gamma[i]; + for ( k=i+1; k<=j; k++ ) { + y[i] -= H[k][i]*y[k]; + } + y[i] /= H[i][i]; + } + + PROF_PRECISION_STOP( _SMALL2, ((j+1)*(j+2))/2 + j+1 ); + + END_MASTER(threading) + SYNC_MASTER_TO_ALL(threading) + + // x = x + V*y + if ( ol ) { + for ( i=0; i<=j; i++ ) { + vector_PRECISION_saxpy( x, x, &V[i], y[i], start, end, l ); + } + } else { + vector_PRECISION_scale( x, &V[0], y[0], start, end, l ); + for ( i=1; i<=j; i++ ) { + vector_PRECISION_saxpy( x, x, &V[i], y[i], start, end, l ); + } + } +} + + + + void local_minres_PRECISION( vector_PRECISION *phi, vector_PRECISION *eta, vector_PRECISION *latest_iter, int start, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ) { diff --git a/src/linsolve_generic.h b/src/linsolve_generic.h index 7bc8ae3..902dc80 100644 --- a/src/linsolve_generic.h +++ b/src/linsolve_generic.h @@ -38,10 +38,14 @@ int arnoldi_step_PRECISION( vector_PRECISION *V, vector_PRECISION *Z, vector_PRECISION *w, complex_PRECISION **H, complex_PRECISION* buffer, int j, void (*prec)(), gmres_PRECISION_struct *p, level_struct *l, struct Thread *threading ); + int arnoldi_step_PRECISION_new( vector_PRECISION *V, vector_PRECISION *Z, vector_PRECISION *w, + complex_PRECISION **H, complex_PRECISION* buffer, int j, int n_vec, void (*prec)(), + gmres_PRECISION_struct *p, level_struct *l, struct Thread *threading ); void qr_update_PRECISION( complex_PRECISION **H, complex_PRECISION *s, complex_PRECISION *c, complex_PRECISION *gamma, int j, level_struct *l, struct Thread *threading ); void compute_solution_PRECISION( vector_PRECISION *x, vector_PRECISION *V, complex_PRECISION *y, complex_PRECISION *gamma, complex_PRECISION **H, int j, int ol, gmres_PRECISION_struct *p, level_struct *l, struct Thread *threading ); - + void compute_solution_PRECISION_new( vector_PRECISION *x, vector_PRECISION *V, complex_PRECISION *y, complex_PRECISION *gamma, + complex_PRECISION **H, int j, int ol, int n_vec, gmres_PRECISION_struct *p, level_struct *l, struct Thread *threading ); #endif diff --git a/src/main.c b/src/main.c index ef2c3cb..a24adf8 100644 --- a/src/main.c +++ b/src/main.c @@ -57,7 +57,9 @@ int main( int argc, char **argv ) { } method_init( &argc, &argv, &l ); - + + g.num_rhs_vect = 4; + no_threading = (struct Thread *)malloc(sizeof(struct Thread)); setup_no_threading(no_threading, &l); @@ -89,7 +91,7 @@ int main( int argc, char **argv ) { solve_driver( &l, &threading ); } - + printf0("Number of rhs vectors = %d\n", g.num_rhs_vect); finalize_common_thread_data(commonthreaddata); finalize_no_threading(no_threading); free(commonthreaddata); diff --git a/src/main.h b/src/main.h index 0187330..8f209fb 100644 --- a/src/main.h +++ b/src/main.h @@ -401,6 +401,9 @@ // bc: 0 dirichlet, 1 periodic, 2 anti-periodic int bc; + // number of rhs vectors (b) to be solved at the same time (hopefully) + int num_rhs_vect; + complex_double **gamma; var_table vt; diff --git a/src/main_post_def_generic.h b/src/main_post_def_generic.h index 4817c43..e7d1bc6 100644 --- a/src/main_post_def_generic.h +++ 
b/src/main_post_def_generic.h @@ -32,6 +32,12 @@ } + static inline void apply_operator_PRECISION_new( vector_PRECISION *output, vector_PRECISION *input, int n, gmres_PRECISION_struct *p, level_struct *l, struct Thread *threading ) { + + p->eval_operator( output, input, n, p->op, l, threading ); + + } + static inline void apply_operator_dagger_PRECISION( vector_PRECISION *output, vector_PRECISION *input, gmres_PRECISION_struct *p, level_struct *l, struct Thread *threading ) { #ifdef HAVE_TM1p1 diff --git a/src/main_pre_def_generic.h b/src/main_pre_def_generic.h index 06ca9e4..3cbbeba 100644 --- a/src/main_pre_def_generic.h +++ b/src/main_pre_def_generic.h @@ -41,6 +41,7 @@ int num_vect; int layout; int type; + int size; struct level_struct *l; } vector_PRECISION; diff --git a/src/operator_generic.c b/src/operator_generic.c index 28bb595..f08a44c 100644 --- a/src/operator_generic.c +++ b/src/operator_generic.c @@ -409,10 +409,10 @@ void operator_PRECISION_test_routine( operator_PRECISION_struct *op, level_struc START_LOCKED_MASTER(threading) vector_double_define_random( &vd[0], 0, l->inner_vector_size, l ); - apply_operator_double( &vd[1], &vd[0], &(g.p), l, no_threading ); + apply_operator_double_new( &vd[1], &vd[0], 0, &(g.p), l, no_threading ); trans_PRECISION( &vp[0], &vd[0], op->translation_table, l, no_threading ); - apply_operator_PRECISION( &vp[1], &vp[0], &(l->p_PRECISION), l, no_threading ); + apply_operator_PRECISION_new( &vp[1], &vp[0], 0, &(l->p_PRECISION), l, no_threading ); trans_back_PRECISION( &vd[2], &vp[1], op->translation_table, l, no_threading ); vector_double_minus( &vd[3], &vd[2], &vd[1], 0, l->inner_vector_size, l ); @@ -423,7 +423,7 @@ void operator_PRECISION_test_routine( operator_PRECISION_struct *op, level_struc END_LOCKED_MASTER(threading) if(threading->n_core > 1) { - apply_operator_PRECISION( &vp[1], &vp[0], &(l->p_PRECISION), l, threading ); + apply_operator_PRECISION_new( &vp[1], &vp[0], 0, &(l->p_PRECISION), l, threading ); SYNC_MASTER_TO_ALL(threading) SYNC_CORES(threading) diff --git a/src/schwarz_generic.c b/src/schwarz_generic.c index 1a86313..b513f88 100644 --- a/src/schwarz_generic.c +++ b/src/schwarz_generic.c @@ -32,7 +32,8 @@ void smoother_PRECISION_def( level_struct *l ) { if ( g.method == 6 ) { l->p_PRECISION.eval_operator = (l->depth > 0)?g5D_apply_coarse_operator_PRECISION:g5D_plus_clover_PRECISION; } else { - l->p_PRECISION.eval_operator = (l->depth > 0)?apply_coarse_operator_PRECISION:d_plus_clover_PRECISION; + //l->p_PRECISION.eval_operator = (l->depth > 0)?apply_coarse_operator_PRECISION:d_plus_clover_PRECISION; + l->p_PRECISION.eval_operator = d_plus_clover_PRECISION_new; } } diff --git a/src/solver_analysis.c b/src/solver_analysis.c index 5dd75d0..1c1a20b 100644 --- a/src/solver_analysis.c +++ b/src/solver_analysis.c @@ -50,11 +50,11 @@ void test_routine( level_struct *l, struct Thread *threading ) { if ( g.method > 0 && g.method < 4 && g.odd_even ) block_oddeven_double_test( l, threading ); } - if ( g.mixed_precision ) + /* if ( g.mixed_precision ) vector_float_test_routine( l, threading ); else vector_double_test_routine( l, threading ); - +*/ if ( g.interpolation && g.method > 0 ) { if ( g.mixed_precision ) coarse_operator_float_test_routine( l, threading ); diff --git a/src/top_level.c b/src/top_level.c index cd1d672..83fbd55 100644 --- a/src/top_level.c +++ b/src/top_level.c @@ -27,19 +27,23 @@ void rhs_define( vector_double *rhs, level_struct *l, struct Thread *threading ) if(threading->thread != 0) return; - int start = 
threading->start_index[l->depth]; - int end = threading->end_index[l->depth]; + //int start = threading->start_index[l->depth]; + //int end = threading->end_index[l->depth]; if ( g.rhs == 0 ) { - vector_double_define( rhs, 1, start, end, l ); + //vector_double_define( rhs, 1, start, end, l ); + vector_double_define_new( rhs, 1, l, threading ); START_MASTER(threading) if ( g.print > 0 ) printf0("rhs = ones\n"); END_MASTER(threading) } else if ( g.rhs == 1 ) { - vector_double_define( rhs, 0, start, end, l ); + //vector_double_define( rhs, 0, start, end, l ); + vector_double_define_new( rhs, 0, l, threading ); if ( g.my_rank == 0 ) { START_LOCKED_MASTER(threading) - rhs->vector_buffer[0] = 1.0; + //rhs->vector_buffer[0] = 1.0; + for ( int i=0; i<rhs->num_vect; i++ ) + rhs->vector_buffer[i*(rhs->size)] = 1.0; END_LOCKED_MASTER(threading) } START_MASTER(threading) @@ -48,13 +52,15 @@ void rhs_define( vector_double *rhs, level_struct *l, struct Thread *threading ) } else if ( g.rhs == 2 ) { // this would yield different results if we threaded it, so we don't START_LOCKED_MASTER(threading) - vector_double_define_random( rhs, 0, l->inner_vector_size, l ); + //vector_double_define_random( rhs, 0, l->inner_vector_size, l ); + vector_double_define_random_new( rhs, l, threading ); END_LOCKED_MASTER(threading) START_MASTER(threading) if ( g.print > 0 ) printf0("rhs = random\n"); END_MASTER(threading) } else if ( g.rhs == 3 ) { - vector_double_define( rhs, 0, start, end, l ); + //vector_double_define( rhs, 0, start, end, l ); + vector_double_define_new( rhs, 0, l, threading ); } else { ASSERT( g.rhs >= 0 && g.rhs <= 4 ); } @@ -64,7 +70,7 @@ void rhs_define( vector_double *rhs, level_struct *l, struct Thread *threading ) int wilson_driver( vector_double *solution, vector_double *source, level_struct *l, struct Thread *threading ) { - int iter = 0, start = threading->start_index[l->depth], end = threading->end_index[l->depth]; + int iter = 0; //, start = threading->start_index[l->depth], end = threading->end_index[l->depth]; vector_double rhs = (g.mixed_precision==2 && g.method >= 0)?g.p_MP.dp.b:g.p.b; vector_double sol = (g.mixed_precision==2 && g.method >= 0)?g.p_MP.dp.x:g.p.x; @@ -79,7 +85,8 @@ int wilson_driver( vector_double *solution, vector_double *source, level_struct double tmp_t = -MPI_Wtime(); #endif - vector_double_copy( &rhs, source, start, end, l ); + //vector_double_copy( &rhs, source, start, end, l ); + vector_double_copy_new( &rhs, source, l, threading ); if ( g.method == -1 ) { cgn_double( &(g.p), l, threading ); } else if ( g.mixed_precision == 2 ) { } else { iter = fgmres_double( &(g.p), l, threading ); } - vector_double_copy( solution, &sol, start, end, l ); + //vector_double_copy( solution, &sol, start, end, l ); + vector_double_copy_new( solution, &sol, l, threading ); #ifdef WILSON_BENCHMARK tmp_t += MPI_Wtime(); if ( tmp_t < t_min ) t_min = tmp_t; } @@ -138,52 +146,57 @@ void solve_driver( level_struct *l, struct Thread *threading ) { printf0("inverting doublet operator\n"); } #endif - vector_double_alloc( &solution, _INNER, 1, l, threading ); - vector_double_alloc( &source, _INNER, 1, l, threading ); + vector_double_alloc( &solution, _INNER, g.num_rhs_vect, l, threading ); + vector_double_alloc( &source, _INNER, g.num_rhs_vect, l, threading ); rhs_define( &source, l, threading ); - + if(g.bc==2) - apply_twisted_bc_to_vector_double( &source, &source, g.twisted_bc, l); - - norm = global_norm_double( &source, 
0, l->inner_vector_size, l, threading ); - printf0("source vector norm: %le\n",norm); + apply_twisted_bc_to_vector_double( &source, &source, g.twisted_bc, l); + for( int i=0; iinner_vector_size, l, threading ); + norm = global_norm_double( &source, source.size*i, source.size*(i+1), l, threading ); + printf0("source vector norm: %le\n",norm); + } #ifdef HAVE_TM1p1 if( g.n_flavours == 1 ) #endif #ifdef HAVE_TM - if ( g.mu + g.mu_odd_shift != 0.0 || g.mu + g.mu_even_shift != 0.0 ) - if(g.downprop) { + if ( g.mu + g.mu_odd_shift != 0.0 || g.mu + g.mu_even_shift != 0.0 ) + if(g.downprop) { - START_MASTER(threading) - printf0("\n\n+--------------------------- up ---------------------------+\n\n"); - END_MASTER(threading) + START_MASTER(threading) + printf0("\n\n+--------------------------- up ---------------------------+\n\n"); + END_MASTER(threading) - solve( &solution, &source, l, threading ); + solve( &solution, &source, l, threading ); - if(g.bc==2) - apply_twisted_bc_to_vector_double( &solution, &solution, minus_twisted_bc, l); + if(g.bc==2) + apply_twisted_bc_to_vector_double( &solution, &solution, minus_twisted_bc, l); - START_LOCKED_MASTER(threading) - printf0("\n\n+-------------------------- down --------------------------+\n\n"); - g.mu*=-1; - g.mu_odd_shift*=-1; - g.mu_even_shift*=-1; - END_LOCKED_MASTER(threading) + START_LOCKED_MASTER(threading) + printf0("\n\n+-------------------------- down --------------------------+\n\n"); + g.mu*=-1; + g.mu_odd_shift*=-1; + g.mu_even_shift*=-1; + END_LOCKED_MASTER(threading) - tm_term_update( g.mu, l, threading ); - finalize_operator_update( l, threading ); - } + tm_term_update( g.mu, l, threading ); + finalize_operator_update( l, threading ); + } #endif solve( &solution, &source, l, threading ); if(g.bc==2) apply_twisted_bc_to_vector_double( &solution, &solution, minus_twisted_bc, l); - - norm = global_norm_double( &solution, 0, l->inner_vector_size, l, threading ); - printf0("solution vector norm: %le\n",norm); + + for( int i=0; iinner_vector_size, l, threading ); + norm = global_norm_double( &solution, solution.size*i, solution.size*(i+1), l, threading ); + printf0("solution vector norm: %le\n",norm); + } vector_double_free( &solution, l, threading ); vector_double_free( &source, l, threading ); diff --git a/src/vector_generic.c b/src/vector_generic.c index 9f40f44..3d28cf9 100644 --- a/src/vector_generic.c +++ b/src/vector_generic.c @@ -30,11 +30,17 @@ void vector_PRECISION_init( vector_PRECISION *vec ) { void vector_PRECISION_alloc( vector_PRECISION *vec, const int type, int num_vect, level_struct *l, Thread *threading ) { switch (type){ - case _ORDINARY : PUBLIC_MALLOC( vec->vector_buffer, complex_PRECISION, l->vector_size*num_vect ); + case _ORDINARY : + PUBLIC_MALLOC( vec->vector_buffer, complex_PRECISION, l->vector_size*num_vect ); + vec->size = l->vector_size; break; - case _SCHWARZ : PUBLIC_MALLOC( vec->vector_buffer, complex_PRECISION, l->schwarz_vector_size*num_vect ); + case _SCHWARZ : + PUBLIC_MALLOC( vec->vector_buffer, complex_PRECISION, l->schwarz_vector_size*num_vect ); + vec->size = l->schwarz_vector_size; break; - case _INNER: PUBLIC_MALLOC( vec->vector_buffer, complex_PRECISION, l->inner_vector_size*num_vect ); + case _INNER: + PUBLIC_MALLOC( vec->vector_buffer, complex_PRECISION, l->inner_vector_size*num_vect ); + vec->size = l->inner_vector_size; break; } @@ -45,7 +51,7 @@ void vector_PRECISION_alloc( vector_PRECISION *vec, const int type, int num_vect } -void vector_PRECISION_free( vector_PRECISION *vec, level_struct 
*l, Thread *threading ) { +void vector_PRECISION_free( vector_PRECISION *vec, level_struct *l, struct Thread *threading ) { switch (vec->type){ case _ORDINARY : PUBLIC_FREE( vec->vector_buffer, complex_PRECISION, l->vector_size*vec->num_vect ); @@ -83,6 +89,26 @@ void vector_PRECISION_define( vector_PRECISION *phi, complex_PRECISION value, in } +void vector_PRECISION_define_new( vector_PRECISION *phi, complex_PRECISION value, level_struct *l, struct Thread *threading ) { + + int start, end; + compute_core_start_end(0, (phi->size)*(phi->num_vect), &start, &end, l, threading); + int thread = omp_get_thread_num(); + if(thread == 0) + PROF_PRECISION_START( _SET ); + + if ( phi->vector_buffer != NULL ) { + int i; + for ( i=start; ivector_buffer[i] = value; + } else { + error0("Error in \"vector_PRECISION_define\": pointer is null\n"); + } + if(thread == 0) + PROF_PRECISION_STOP( _SET, 1 ); +} + + void vector_PRECISION_real_scale( vector_PRECISION *z, vector_PRECISION *x, complex_PRECISION alpha, int start, int end, level_struct *l ) { @@ -156,7 +182,25 @@ void vector_PRECISION_copy( vector_PRECISION *z, vector_PRECISION *x, int start, z_pt[i+j] = x_pt[i+j]; if(thread == 0 && start != end) - PROF_PRECISION_STOP( _CPY, (double)(end-start)/(double)l->inner_vector_size ); + PROF_PRECISION_STOP( _CPY, (double)(end-start)/(double)l->inner_vector_size ); +} + + +void vector_PRECISION_copy_new( vector_PRECISION *z, vector_PRECISION *x, level_struct *l, struct Thread *threading ) { + + if(z == x) return; + + int start, end; + compute_core_start_end(0, (x->size)*(x->num_vect), &start, &end, l, threading); + buffer_PRECISION z_pt=z->vector_buffer, x_pt=x->vector_buffer; + int thread = omp_get_thread_num(); + if(thread == 0) + PROF_PRECISION_START( _CPY ); + + VECTOR_FOR( int i=start, iinner_vector_size ); } @@ -174,7 +218,7 @@ void vector_PRECISION_check_comp( vector_PRECISION *vec1, vector_PRECISION *vec2 } -void vector_PRECISION_change_layout( vector_PRECISION *vec_out, vector_PRECISION *vec_in, const int layout, Thread *threading ) { +void vector_PRECISION_change_layout( vector_PRECISION *vec_out, vector_PRECISION *vec_in, const int layout, struct Thread *threading ) { if(vec_in->layout==layout) return; @@ -255,7 +299,7 @@ void vector_PRECISION_test_routine( level_struct *l, struct Thread *threading ) for(int i=0; i<3; i++){ vector_PRECISION_free( &vp[i], l, threading ); } - if ( l->level == 0 ) + if ( l->level == 0 && g.method == 0) return; else vector_PRECISION_test_routine(l->next_level, threading); diff --git a/src/vector_generic.h b/src/vector_generic.h index 1ca720e..c98148f 100644 --- a/src/vector_generic.h +++ b/src/vector_generic.h @@ -25,14 +25,16 @@ struct Thread; void vector_PRECISION_init( vector_PRECISION *vec ); - void vector_PRECISION_alloc( vector_PRECISION *vec, const int type, int num_vect, level_struct *l, Thread *threading ); + void vector_PRECISION_alloc( vector_PRECISION *vec, const int type, int num_vect, level_struct *l, struct Thread *threading ); void vector_PRECISION_free( vector_PRECISION *vec, level_struct *l, Thread *threading); void vector_PRECISION_define( vector_PRECISION *phi, complex_PRECISION value, int start, int end, level_struct *l ); + void vector_PRECISION_define_new( vector_PRECISION *phi, complex_PRECISION value, level_struct *l, struct Thread *threading ); void vector_PRECISION_real_scale( vector_PRECISION *z, vector_PRECISION *x, complex_PRECISION alpha, int start, int end, level_struct *l ); void vector_PRECISION_copy( vector_PRECISION *z, vector_PRECISION 
*x, int start, int end, level_struct *l ); // z := x + void vector_PRECISION_copy_new( vector_PRECISION *z, vector_PRECISION *x, level_struct *l, struct Thread *threading ); void vector_PRECISION_check_comp( vector_PRECISION *vec1, vector_PRECISION *vec2 ); - void vector_PRECISION_change_layout( vector_PRECISION *vec_out, vector_PRECISION *vec_in, const int layout, Thread *threading ); + void vector_PRECISION_change_layout( vector_PRECISION *vec_out, vector_PRECISION *vec_in, const int layout, struct Thread *threading ); void vector_PRECISION_test_routine( level_struct *l, struct Thread *threading ); #endif From 53c64833c5bc0faefbed5ab19a985bb6e1dd3450 Mon Sep 17 00:00:00 2001 From: Marc Illa Date: Tue, 14 Aug 2018 10:01:46 +0300 Subject: [PATCH 21/31] Modified timer to get total time + individual times for each vector --- src/linsolve_generic.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/src/linsolve_generic.c b/src/linsolve_generic.c index 9e3e036..863ca47 100644 --- a/src/linsolve_generic.c +++ b/src/linsolve_generic.c @@ -235,7 +235,13 @@ int fgmres_PRECISION( gmres_PRECISION_struct *p, level_struct *l, struct Thread complex_PRECISION gamma0 = 0; complex_PRECISION beta = 0; + PRECISION tt0=0, tt1=0; + START_LOCKED_MASTER(threading) + if ( l->depth == 0 ) tt0 = MPI_Wtime(); + END_LOCKED_MASTER(threading) + + for( n_vec=0; n_vecv_start+p->r.size*n_vec, p->v_end+p->r.size*n_vec, &start, &end, l, threading); @@ -428,6 +432,14 @@ int fgmres_PRECISION( gmres_PRECISION_struct *p, level_struct *l, struct Thread END_MASTER(threading) } } + START_LOCKED_MASTER(threading) + if ( l->depth == 0 ) tt1 = MPI_Wtime(); + if ( p->print ) { + printf0("+----------------------------------------------------------+\n"); + printf0("| total elapsed wall clock time: %-8.4lf seconds |\n", tt1-tt0 ); + printf0("+----------------------------------------------------------+\n"); + } + END_LOCKED_MASTER(threading) return iter; } From d070d61b5d6cc3ce9357de587b5d2bcc7673c03b Mon Sep 17 00:00:00 2001 From: Marc Illa Date: Mon, 20 Aug 2018 10:28:35 +0300 Subject: [PATCH 22/31] Vectorized version method 0 (minus apply_operator) --- src/init.c | 1 + src/linalg_generic.c | 135 ++++++++++++++++++++-- src/linalg_generic.h | 8 +- src/linsolve_generic.c | 246 +++++++++++++++++++++++++---------------- src/linsolve_generic.h | 6 +- src/top_level.c | 12 +- src/vector_generic.c | 55 ++++++++- src/vector_generic.h | 2 + 8 files changed, 344 insertions(+), 121 deletions(-) diff --git a/src/init.c b/src/init.c index 31426b4..c417f51 100644 --- a/src/init.c +++ b/src/init.c @@ -682,6 +682,7 @@ void g_init( level_struct *l ) { g.cur_storage = 0; g.max_storage = 0; g.in_setup = 0; + g.num_rhs_vect = 0; } void read_global_info( FILE *in ) { diff --git a/src/linalg_generic.c b/src/linalg_generic.c index 94be137..c9d1c4d 100644 --- a/src/linalg_generic.c +++ b/src/linalg_generic.c @@ -111,8 +111,6 @@ complex_PRECISION process_inner_product_PRECISION( vector_PRECISION *phi, vector void process_multi_inner_product_PRECISION( int count, complex_PRECISION *results, vector_PRECISION *phi, vector_PRECISION *psi, int start, int end, level_struct *l, struct Thread *threading ) { - //vector_PRECISION_check_comp( phi, psi ); - PROF_PRECISION_START( _PIP, threading ); int i; for(int c=0; cinner_vector_size, threading ); } + + + +void process_multi_inner_product_PRECISION_new( int count, complex_PRECISION *results, vector_PRECISION *phi, vector_PRECISION *psi, + level_struct *l, struct Thread *threading ) { + + 
int start, end; + compute_core_start_end(0, psi->size, &start, &end, l, threading); + int thread = omp_get_thread_num(); + if(thread == 0 && start != end) + PROF_PRECISION_START( _PIP, threading ); + + int i, j; + for(int c=0; cnum_vect; c++) + results[c] = 0.0; + + vector_PRECISION_change_layout( psi, psi, _LV_SV_NV, no_threading ); + for(int c=0; cnum_vect; j++) + results[c*psi->num_vect+j] += conj_PRECISION(phi[c].vector_buffer[i*psi->num_vect+j])*psi->vector_buffer[i*psi->num_vect+j]; + + vector_PRECISION_change_layout( psi, psi, _NV_LV_SV, no_threading ); + for(int c=0; cinner_vector_size, threading ); +} + #endif @@ -253,11 +284,33 @@ PRECISION process_norm_PRECISION( vector_PRECISION *x, int start, int end, level return (PRECISION)sqrt((double)local_alpha); } +void global_norm_PRECISION_new( PRECISION *res, vector_PRECISION *x, level_struct *l, struct Thread *threading ) { + + int start, end; + compute_core_start_end(0, x->size, &start, &end, l, threading); + int thread = omp_get_thread_num(); + + int i, j; + for( j=0; jnum_vect; j++){ + res[j]=0; + } + + //START_MASTER(threading) + vector_PRECISION_change_layout( x, x, _LV_SV_NV, no_threading ); + for( i=start; inum_vect; j++){ + res[j] += NORM_SQUARE_PRECISION(x->vector_buffer[i*x->num_vect+j]); + } + vector_PRECISION_change_layout( x, x, _NV_LV_SV, no_threading ); + for( j=0; jnum_vect; j++){ + res[j] = (PRECISION)sqrt((double)res[j]); + } + //END_MASTER(threading) +} + void vector_PRECISION_plus( vector_PRECISION *z, vector_PRECISION *x, vector_PRECISION *y, int start, int end, level_struct *l ) { - //vector_PRECISION_check_comp( x, y ); - int thread = omp_get_thread_num(); if(thread == 0 && start != end) PROF_PRECISION_START( _LA2 ); @@ -270,8 +323,6 @@ void vector_PRECISION_plus( vector_PRECISION *z, vector_PRECISION *x, vector_PRE void vector_PRECISION_minus( vector_PRECISION *z, vector_PRECISION *x, vector_PRECISION *y, int start, int end, level_struct *l ) { - - //vector_PRECISION_check_comp( x, y ); int thread = omp_get_thread_num(); if(thread == 0 && start != end) @@ -283,11 +334,33 @@ void vector_PRECISION_minus( vector_PRECISION *z, vector_PRECISION *x, vector_PR PROF_PRECISION_STOP( _LA2, (double)(end-start)/(double)l->inner_vector_size ); } + +void vector_PRECISION_minus_new( vector_PRECISION *z, vector_PRECISION *x, vector_PRECISION *y, level_struct *l, struct Thread *threading ) { + + int i, j, start, end; + compute_core_start_end(0, y->size, &start, &end, l, threading); + int thread = omp_get_thread_num(); + if(thread == 0 && start != end) + PROF_PRECISION_START( _LA2 ); + + vector_PRECISION_change_layout( x, x, _LV_SV_NV, no_threading ); + vector_PRECISION_change_layout( y, y, _LV_SV_NV, no_threading ); + vector_PRECISION_change_layout( z, z, _LV_SV_NV, no_threading ); + for( i=start; inum_vect; j++){ + z->vector_buffer[i*x->num_vect+j] = x->vector_buffer[i*x->num_vect+j] - y->vector_buffer[i*x->num_vect+j]; + } + vector_PRECISION_change_layout( x, x, _NV_LV_SV, no_threading ); + vector_PRECISION_change_layout( y, y, _NV_LV_SV, no_threading ); + vector_PRECISION_change_layout( z, z, _NV_LV_SV, no_threading ); + + if(thread == 0 && start != end) + PROF_PRECISION_STOP( _LA2, (double)(end-start)/(double)l->inner_vector_size ); +} + #ifndef OPTIMIZED_LINALG_PRECISION void vector_PRECISION_scale( vector_PRECISION *z, vector_PRECISION *x, complex_PRECISION alpha, int start, int end, level_struct *l ) { - //vector_PRECISION_check_comp( z, x ); - int thread = omp_get_thread_num(); if(thread == 0 && start != end) 
PROF_PRECISION_START( _LA6 ); @@ -297,6 +370,27 @@ void vector_PRECISION_scale( vector_PRECISION *z, vector_PRECISION *x, complex_P if(thread == 0 && start != end) PROF_PRECISION_STOP( _LA6, (double)(end-start)/(double)l->inner_vector_size ); } + +void vector_PRECISION_scale_new( vector_PRECISION *z, vector_PRECISION *x, complex_PRECISION *alpha, int k, level_struct *l, struct Thread *threading ) { + + int i, j, start, end; + compute_core_start_end(0, x->size, &start, &end, l, threading); + int thread = omp_get_thread_num(); + if(thread == 0 && start != end) + PROF_PRECISION_START( _LA6 ); + + vector_PRECISION_change_layout( x, x, _LV_SV_NV, no_threading ); + vector_PRECISION_change_layout( z, z, _LV_SV_NV, no_threading ); + for( i=start; inum_vect; j++){ + z->vector_buffer[i*x->num_vect+j] = alpha[k*x->num_vect+j]*x->vector_buffer[i*x->num_vect+j]; + } + vector_PRECISION_change_layout( x, x, _NV_LV_SV, no_threading ); + vector_PRECISION_change_layout( z, z, _NV_LV_SV, no_threading ); + + if(thread == 0 && start != end) + PROF_PRECISION_STOP( _LA6, (double)(end-start)/(double)l->inner_vector_size ); +} #endif @@ -332,8 +426,6 @@ void buffer_PRECISION_copy( complex_PRECISION *z, complex_PRECISION *x, int star #ifndef OPTIMIZED_LINALG_PRECISION void vector_PRECISION_saxpy( vector_PRECISION *z, vector_PRECISION *x, vector_PRECISION *y, complex_PRECISION alpha, int start, int end, level_struct *l ) { - //vector_PRECISION_check_comp( x, y ); - int thread = omp_get_thread_num(); if (thread == 0 && start != end ) PROF_PRECISION_START( _LA8 ); @@ -343,6 +435,29 @@ void vector_PRECISION_saxpy( vector_PRECISION *z, vector_PRECISION *x, vector_PR if( thread == 0 && start != end ) PROF_PRECISION_STOP( _LA8, (double)(end-start)/(double)l->inner_vector_size ); } + +void vector_PRECISION_saxpy_new( vector_PRECISION *z, vector_PRECISION *x, vector_PRECISION *y, complex_PRECISION *alpha, int k, int sign, level_struct *l, struct Thread *threading ) { + + int i, j, start, end; + compute_core_start_end(0, x->size, &start, &end, l, threading); + int thread = omp_get_thread_num(); + if (thread == 0 && start != end ) + PROF_PRECISION_START( _LA8 ); + + vector_PRECISION_change_layout( x, x, _LV_SV_NV, no_threading ); + vector_PRECISION_change_layout( y, y, _LV_SV_NV, no_threading ); + vector_PRECISION_change_layout( z, z, _LV_SV_NV, no_threading ); + for( i=start; inum_vect; j++){ + z->vector_buffer[i*x->num_vect+j] = x->vector_buffer[i*x->num_vect+j] + sign*alpha[k*x->num_vect+j]*y->vector_buffer[i*x->num_vect+j]; + } + vector_PRECISION_change_layout( x, x, _NV_LV_SV, no_threading ); + vector_PRECISION_change_layout( y, y, _NV_LV_SV, no_threading ); + vector_PRECISION_change_layout( z, z, _NV_LV_SV, no_threading ); + + if( thread == 0 && start != end ) + PROF_PRECISION_STOP( _LA8, (double)(end-start)/(double)l->inner_vector_size ); +} #endif #ifndef OPTIMIZED_LINALG_PRECISION diff --git a/src/linalg_generic.h b/src/linalg_generic.h index aa3055d..712ad36 100644 --- a/src/linalg_generic.h +++ b/src/linalg_generic.h @@ -104,17 +104,23 @@ void process_multi_inner_product_PRECISION( int count, complex_PRECISION *results, vector_PRECISION *phi, vector_PRECISION *psi, int start, int end, level_struct *l, struct Thread *threading ); + void process_multi_inner_product_PRECISION_new( int count, complex_PRECISION *results, vector_PRECISION *phi, vector_PRECISION *psi, + level_struct *l, struct Thread *threading ); PRECISION global_norm_PRECISION( vector_PRECISION *phi, int start, int end, level_struct *l, struct Thread 
*threading ); PRECISION process_norm_PRECISION( vector_PRECISION *x, int start, int end, level_struct *l, struct Thread *threading ); - + void global_norm_PRECISION_new( PRECISION *res, vector_PRECISION *x, level_struct *l, struct Thread *threading ); + complex_PRECISION local_xy_over_xx_PRECISION( vector_PRECISION *phi, vector_PRECISION *psi, int start, int end, level_struct *l ); void vector_PRECISION_plus( vector_PRECISION *z, vector_PRECISION *x, vector_PRECISION *y, int start, int end, level_struct *l ); // z := x + y void vector_PRECISION_minus( vector_PRECISION *z, vector_PRECISION *x, vector_PRECISION *y, int start, int end, level_struct *l ); // z := x - y + void vector_PRECISION_minus_new( vector_PRECISION *z, vector_PRECISION *x, vector_PRECISION *y, level_struct *l, struct Thread *threading ); void vector_PRECISION_scale( vector_PRECISION *z, vector_PRECISION *x, complex_PRECISION alpha, int start, int end, level_struct *l ); // z := alpha*x + void vector_PRECISION_scale_new( vector_PRECISION *z, vector_PRECISION *x, complex_PRECISION *alpha, int k, level_struct *l, struct Thread *threading ); void buffer_PRECISION_real_scale( complex_PRECISION *z, complex_PRECISION *x, complex_PRECISION alpha, int start, int end, level_struct *l ); void vector_PRECISION_saxpy( vector_PRECISION *z, vector_PRECISION *x, vector_PRECISION *y, complex_PRECISION alpha, int start, int end, level_struct *l ); // z := x + alpha*y + void vector_PRECISION_saxpy_new( vector_PRECISION *z, vector_PRECISION *x, vector_PRECISION *y, complex_PRECISION *alpha, int k, int sign, level_struct *l, struct Thread *threading ); void buffer_PRECISION_copy( complex_PRECISION *z, complex_PRECISION *x, int start, int end, level_struct *l ); // z := x void vector_PRECISION_projection( vector_PRECISION *z, vector_PRECISION *v, int k, vector_PRECISION *W, complex_PRECISION *diag, int orthogonal, level_struct *l, Thread *threading ); diff --git a/src/linsolve_generic.c b/src/linsolve_generic.c index 863ca47..b780cf3 100644 --- a/src/linsolve_generic.c +++ b/src/linsolve_generic.c @@ -62,7 +62,7 @@ void fgmres_PRECISION_struct_alloc( int m, int n, const int vl_type, PRECISION t *********************************************************************************/ long int total=0; - int i, k=0, n_vl=g.num_rhs_vect;//, n_vl2=1; + int i, k=0, n_vl=g.num_rhs_vect; p->restart_length = m; p->num_restart = n; @@ -73,11 +73,10 @@ void fgmres_PRECISION_struct_alloc( int m, int n, const int vl_type, PRECISION t #ifdef HAVE_TM1p1 n_vl*=2; - //n_vl2=2; #endif if(m > 0) { - total += (m+1)*m; // Hessenberg matrix + total += (m+1)*m*n_vl; // Hessenberg matrix MALLOC( p->H, complex_PRECISION*, m ); MALLOC( p->V, vector_PRECISION, m+1 ); @@ -100,7 +99,7 @@ void fgmres_PRECISION_struct_alloc( int m, int n, const int vl_type, PRECISION t #endif } - total += 4*(m+1); // y, gamma, c, s + total += 4*(m+1)*n_vl; // y, gamma, c, s p->H[0] = NULL; // allocate connected memory MALLOC( p->H[0], complex_PRECISION, total ); @@ -111,17 +110,17 @@ void fgmres_PRECISION_struct_alloc( int m, int n, const int vl_type, PRECISION t // ordering: H, y, gamma, c, s, w, V, Z, x, r, b // H for ( i=1; iH[i] = p->H[0] + i*(m+1); - total += m*(m+1); + p->H[i] = p->H[0] + i*(m+1)*n_vl; + total += m*(m+1)*n_vl; // y - p->y = p->H[0] + total; total += m+1; + p->y = p->H[0] + total; total += (m+1)*n_vl; // gamma - p->gamma = p->H[0] + total; total += m+1; + p->gamma = p->H[0] + total; total += (m+1)*n_vl; // c - p->c = p->H[0] + total; total += m+1; + p->c = p->H[0] + total; 
total += (m+1)*n_vl; // s - p->s = p->H[0] + total; total += m+1; + p->s = p->H[0] + total; total += (m+1)*n_vl; // w vector_PRECISION_alloc( &(p->w), vl_type, n_vl, l, no_threading ); // V @@ -231,18 +230,23 @@ int fgmres_PRECISION( gmres_PRECISION_struct *p, level_struct *l, struct Thread int end; //int j=-1, finish=0, iter=0, il, ol, res, n_vec=0; - int iter=0, il, ol, res, n_vec=0; - complex_PRECISION gamma0 = 0; + int iter=0, il, ol, res, n_vect=g.num_rhs_vect, i, n_vec; + complex_PRECISION gamma0[n_vect];//gamma0 = 0; - complex_PRECISION beta = 0; - PRECISION tt0=0, tt1=0; + PRECISION beta[n_vect];//complex_PRECISION beta = 0; + /*PRECISION tt0=0, tt1=0; START_LOCKED_MASTER(threading) if ( l->depth == 0 ) tt0 = MPI_Wtime(); END_LOCKED_MASTER(threading) - - for( n_vec=0; n_vecdepth==0 && ( p->timing || p->print ) ) prof_init( l ); @@ -259,47 +263,59 @@ int fgmres_PRECISION( gmres_PRECISION_struct *p, level_struct *l, struct Thread // this puts zero for all other hyperthreads, so we can call functions below with all hyperthreads int j=-1, finish=0; iter = 0; - compute_core_start_end(p->v_start+p->r.size*n_vec, p->v_end+p->r.size*n_vec, &start, &end, l, threading); - printf0("n_vec=%d\n", n_vec); + //compute_core_start_end(p->v_start+p->r.size*n_vec, p->v_end+p->r.size*n_vec, &start, &end, l, threading); + //printf0("n_vec=%d\n", n_vec); + SYNC_CORES(threading) for( ol=0; olnum_restart && finish==0; ol++ ) { if( ol == 0 && p->initial_guess_zero ) { res = _NO_RES; - vector_PRECISION_copy( &(p->r), &(p->b), start, end, l ); - //vector_PRECISION_copy_new( &(p->r), &(p->b), l, threading ); + //vector_PRECISION_copy( &(p->r), &(p->b), start, end, l ); + vector_PRECISION_copy_new( &(p->r), &(p->b), l, threading ); } else { res = _RES; if ( p->kind == _LEFT && p->preconditioner ) { apply_operator_PRECISION( &(p->Z[0]), &(p->x), p, l, threading ); if ( g.method == 5 ) { START_LOCKED_MASTER(threading) - g.bicgstab_tol = (!g.mixed_precision)?p->tol:MAX( 1E-3, (p->tol/(gamma_jp1/norm_r0))*5E-1 ); + //g.bicgstab_tol = (!g.mixed_precision)?p->tol:MAX( 1E-3, (p->tol/(gamma_jp1/norm_r0))*5E-1 ); END_LOCKED_MASTER(threading) } p->preconditioner( &(p->w), NULL, &(p->Z[0]), _NO_RES, l, threading ); } else { //apply_operator_PRECISION( &(p->w), &(p->x), p, l, threading ); // compute w = D*x - apply_operator_PRECISION_new( &(p->w), &(p->x), n_vec, p, l, threading ); + for( n_vec=0; n_vecw), &(p->x), n_vec, p, l, threading ); } - vector_PRECISION_minus( &(p->r), &(p->b), &(p->w), start, end, l ); // compute r = b - w + //vector_PRECISION_minus( &(p->r), &(p->b), &(p->w), start, end, l ); // compute r = b - w + vector_PRECISION_minus_new( &(p->r), &(p->b), &(p->w), l, threading ); } //gamma0 = global_norm_PRECISION( &(p->r), p->v_start, p->v_end, l, threading ); // gamma_0 = norm(r) - gamma0 = global_norm_PRECISION( &(p->r), p->v_start+p->r.size*n_vec, p->v_end+p->r.size*n_vec, l, threading ); + global_norm_PRECISION_new( gamma0_real, &(p->r), l, threading ); + for( i=0; igamma[0] = gamma0; + //p->gamma[0] = gamma0; + for( i=0; igamma[i] = gamma0[i]; END_MASTER(threading); SYNC_MASTER_TO_ALL(threading); if ( ol == 0 ) { if (l->depth == 0 && !p->initial_guess_zero) { - norm_r0 = global_norm_PRECISION( &(p->b), p->v_start, p->v_end, l, threading ); - printf0("| initial guess relative residual: %le |\n", creal(gamma0)/norm_r0); + //norm_r0 = global_norm_PRECISION( &(p->b), p->v_start, p->v_end, l, threading ); + global_norm_PRECISION_new( norm_r0, &(p->b), l, threading ); + for( i=0; igamma[0]); + //norm_r0 = 
creal(p->gamma[0]); + for( i=0; igamma[i]); } } - - vector_PRECISION_real_scale( &(p->V[0]), &(p->r), 1/p->gamma[0], start, end, l ); // v_0 = r / gamma_0 + //vector_PRECISION_real_scale( &(p->V[0]), &(p->r), 1/p->gamma[0], start, end, l ); // v_0 = r / gamma_0 + vector_PRECISION_real_scale_new( &(p->V[0]), &(p->r), p->gamma, 0, 1, l, threading ); #if defined(SINGLE_ALLREDUCE_ARNOLDI) && defined(PIPELINED_ARNOLDI) if ( l->level == 0 && l->depth > 0 ) { arnoldi_step_PRECISION( p->V, p->Z, &(p->w), p->H, p->y, 0, p->preconditioner, p, l, threading ); @@ -310,7 +326,7 @@ int fgmres_PRECISION( gmres_PRECISION_struct *p, level_struct *l, struct Thread j = il; iter++; if ( g.method == 5 ) { START_LOCKED_MASTER(threading) - g.bicgstab_tol = (!g.mixed_precision)?p->tol:MAX( 1E-3, (p->tol/(gamma_jp1/norm_r0))*5E-1 ); + //g.bicgstab_tol = (!g.mixed_precision)?p->tol:MAX( 1E-3, (p->tol/(gamma_jp1/norm_r0))*5E-1 ); END_LOCKED_MASTER(threading) } @@ -328,32 +344,45 @@ int fgmres_PRECISION( gmres_PRECISION_struct *p, level_struct *l, struct Thread } } #else - if ( !arnoldi_step_PRECISION_new( p->V, p->Z, &(p->w), p->H, p->y, j, n_vec, p->preconditioner, p, l, threading ) ) { + if ( !arnoldi_step_PRECISION_new( p->V, p->Z, &(p->w), p->H, p->y, j, p->preconditioner, p, l, threading ) ) { printf0("| -------------- iteration %d, restart due to H(%d,%d) < 0 |\n", iter, j+1, j ); break; } #endif - - if ( cabs( p->H[j][j+1] ) > p->tol/10 ) { + H_tot=0; + for( i=0; iH[j][(j+1)*n_vect+i] ); + + //if ( cabs( p->H[j][j+1] ) > p->tol/10 ) { + if ( H_tot > n_vect*p->tol/10 ) { qr_update_PRECISION( p->H, p->s, p->c, p->gamma, j, l, threading ); - gamma_jp1 = cabs( p->gamma[j+1] ); + //gamma_jp1 = cabs( p->gamma[(j+1)] ); + for( i=0; igamma[(j+1)*n_vect+i] ); #if defined(TRACK_RES) && !defined(WILSON_BENCHMARK) if ( iter%10 == 0 || p->preconditioner != NULL || l->depth > 0 ) { START_MASTER(threading) if ( p->print && g.print > 0 ) - printf0("| approx. rel. res. after %-6d iterations: %e |\n", iter, gamma_jp1/norm_r0 ); + for( i=0; itol || gamma_jp1/norm_r0 > 1E+5 ) { // if satisfied ... stop + gamma_tot=0; + for( i=0; itol || gamma_jp1/norm_r0 > 1E+5 ) { // if satisfied ... 
stop + if( gamma_tot < n_vect*p->tol || gamma_tot > n_vect*1E+5 ) { finish = 1; START_MASTER(threading) - if ( gamma_jp1/norm_r0 > 1E+5 ) printf0("Divergence of fgmres_PRECISION, iter = %d, level=%d\n", iter, l->level ); + if ( gamma_tot > n_vect*1E+5 ) printf0("Divergence of fgmres_PRECISION, iter = %d, level=%d\n", iter, l->level ); END_MASTER(threading) } } else { - printf0("depth: %d, iter: %d, p->H(%d,%d) = %+lf+%lfi\n", l->depth, iter, j+1, j, CSPLIT( p->H[j][j+1] ) ); + for( i=0; iH(%d,%d) = %+lf+%lfi\n", i, l->depth, iter, j+1, j, CSPLIT( p->H[j][(j+1)*n_vect+i] ) ); finish = 1; break; } @@ -361,32 +390,41 @@ int fgmres_PRECISION( gmres_PRECISION_struct *p, level_struct *l, struct Thread /*compute_solution_PRECISION( &(p->x), (p->preconditioner&&p->kind==_RIGHT)?(p->Z):(p->V), p->y, p->gamma, p->H, j, (res==_NO_RES)?ol:1, p, l, threading );*/ compute_solution_PRECISION_new( &(p->x), (p->preconditioner&&p->kind==_RIGHT)?(p->Z):(p->V), - p->y, p->gamma, p->H, j, (res==_NO_RES)?ol:1, n_vec, p, l, threading ); + p->y, p->gamma, p->H, j, (res==_NO_RES)?ol:1, p, l, threading ); } // end of fgmres START_LOCKED_MASTER(threading) - if ( l->depth == 0 ) { t1 = MPI_Wtime(); g.total_time = t1-t0; g.iter_count = iter; g.norm_res = gamma_jp1/norm_r0; } + if ( l->depth == 0 ) { t1 = MPI_Wtime(); g.total_time = t1-t0; g.iter_count = iter; g.norm_res = gamma_tot ; }//= gamma_jp1/norm_r0; } END_LOCKED_MASTER(threading) if ( p->print ) { #ifdef FGMRES_RESTEST //apply_operator_PRECISION( &(p->w), &(p->x), p, l, threading ); - apply_operator_PRECISION_new( &(p->w), &(p->x), n_vec, p, l, threading ); - vector_PRECISION_minus( &(p->r), &(p->b), &(p->w), start, end, l ); - beta = global_norm_PRECISION( &(p->r), p->v_start+p->r.size*n_vec, p->v_end+p->r.size*n_vec, l, threading ); + for( n_vec=0; n_vecw), &(p->x), n_vec, p, l, threading ); + //vector_PRECISION_minus( &(p->r), &(p->b), &(p->w), start, end, l ); + vector_PRECISION_minus_new( &(p->r), &(p->b), &(p->w), l, threading ); + //beta = global_norm_PRECISION( &(p->r), p->v_start+p->r.size*n_vec, p->v_end+p->r.size*n_vec, l, threading ); + global_norm_PRECISION_new( beta, &(p->r), l, threading ); #else - beta = gamma_jp1; + for( i=0; i 0 ) printf0("+----------------------------------------------------------+\n\n"); #endif printf0("+----------------------------------------------------------+\n"); printf0("| FGMRES iterations: %-6d coarse average: %-6.2lf |\n", iter, ((double)g.coarse_iter_count)/((double)iter) ); - printf0("| exact relative residual: ||r||/||b|| = %e |\n", creal(beta)/norm_r0 ); - printf0("| soltion for the vector : n_vec = %d |\n", n_vec+1 ); + for( i=0; i 0 ) printf0("| coarse grid time: %-8.4lf seconds (%04.1lf%%) |\n", @@ -402,7 +440,7 @@ int fgmres_PRECISION( gmres_PRECISION_struct *p, level_struct *l, struct Thread if ( l->depth > 0 ) { START_MASTER(threading) char number[3]; sprintf( number, "%2d", 31+l->depth ); printf0("\033[1;%2sm|", number ); - printf0(" - depth: %d, gmres iter: %2d, approx rel res: %le |", l->depth, iter, gamma_jp1/norm_r0 ); + //printf0(" - depth: %d, gmres iter: %2d, approx rel res: %le |", l->depth, iter, gamma_jp1/norm_r0 ); printf0("\033[0m\n"); fflush(0); END_MASTER(threading) } @@ -431,15 +469,7 @@ int fgmres_PRECISION( gmres_PRECISION_struct *p, level_struct *l, struct Thread if ( g.method != 6 ) prof_print( l ); END_MASTER(threading) } - } - START_LOCKED_MASTER(threading) - if ( l->depth == 0 ) tt1 = MPI_Wtime(); - if ( p->print ) { - 
printf0("+----------------------------------------------------------+\n"); - printf0("| total elapsed wall clock time: %-8.4lf seconds |\n", tt1-tt0 ); - printf0("+----------------------------------------------------------+\n"); - } - END_LOCKED_MASTER(threading) + //} return iter; } @@ -925,7 +955,7 @@ int arnoldi_step_PRECISION( vector_PRECISION *V, vector_PRECISION *Z, vector_PRE int arnoldi_step_PRECISION_new( vector_PRECISION *V, vector_PRECISION *Z, vector_PRECISION *w, - complex_PRECISION **H, complex_PRECISION* buffer, int j, int n_vec, void (*prec)(), + complex_PRECISION **H, complex_PRECISION* buffer, int j, void (*prec)(), gmres_PRECISION_struct *p, level_struct *l, struct Thread *threading ) { /********************************************************************************* @@ -1081,12 +1111,13 @@ int arnoldi_step_PRECISION_new( vector_PRECISION *V, vector_PRECISION *Z, vector #else SYNC_MASTER_TO_ALL(threading) SYNC_CORES(threading) - int i; + int i, n_vect=g.num_rhs_vect, n_vec; + PRECISION H_tot; // start and end indices for vector functions depending on thread int start, end; // compute start and end indices for core // this puts zero for all other hyperthreads, so we can call functions below with all hyperthreads - compute_core_start_end(p->v_start+p->w.size*n_vec, p->v_end+p->w.size*n_vec, &start, &end, l, threading); + //compute_core_start_end(p->v_start, p->v_end, &start, &end, l, threading); if ( prec != NULL ) { if ( p->kind == _LEFT ) { @@ -1106,33 +1137,37 @@ int arnoldi_step_PRECISION_new( vector_PRECISION *V, vector_PRECISION *Z, vector } } } else { - apply_operator_PRECISION_new( w, &V[j], n_vec, p, l, threading ); // w = D*V[j] + for( n_vec=0; n_vecv_start+p->w.size*n_vec, p->v_end+p->w.size*n_vec, l, threading ); + complex_PRECISION tmp[(j+1)*n_vect]; + process_multi_inner_product_PRECISION_new( j+1, tmp, V, w, l, threading ); START_MASTER(threading) for( i=0; i<=j; i++ ) - buffer[i] = tmp[i]; + for( n_vec=0; n_vec 1 ) { PROF_PRECISION_START( _ALLR ); MPI_Allreduce( buffer, H[j], j+1, MPI_COMPLEX_PRECISION, MPI_SUM, (l->depth==0)?g.comm_cart:l->gs_PRECISION.level_comm ); PROF_PRECISION_STOP( _ALLR, 1 ); } else { for( i=0; i<=j; i++ ) - H[j][i] = buffer[i]; + for( n_vec=0; n_vecv_start+p->w.size*n_vec, p->v_end+p->w.size*n_vec, l, threading ); + process_multi_inner_product_PRECISION_new( j+1, tmp, V, w, l, threading ); START_MASTER(threading) for( i=0; i<=j; i++ ) - buffer[i] = tmp[i]; + for( n_vec=0; n_vec 1 ) { PROF_PRECISION_START( _ALLR ); MPI_Allreduce( buffer, tmp, j+1, MPI_COMPLEX_PRECISION, MPI_SUM, (l->depth==0)?g.comm_cart:l->gs_PRECISION.level_comm ); @@ -1140,24 +1175,30 @@ int arnoldi_step_PRECISION_new( vector_PRECISION *V, vector_PRECISION *Z, vector } for( i=0; i<=j; i++ ) - H[j][i] += tmp[i]; + for( n_vec=0; n_vecv_start+p->w.size*n_vec, p->v_end+p->w.size*n_vec, l, threading ); + PRECISION tmp2[n_vect]; + global_norm_PRECISION_new( tmp2, w, l, threading ); START_MASTER(threading) - H[j][j+1] = tmp2; + for( n_vec=0; n_vec 1e-15 ) - vector_PRECISION_real_scale( &V[j+1], w, 1/H[j][j+1], start, end, l ); + H_tot=0; + for( i=0; iH[j][(j+1)*n_vect+i] ); + if ( H_tot > n_vect*1e-15 ) + vector_PRECISION_real_scale_new( &V[j+1], w, H[j], j+1, 1, l, threading ); #endif return 1; } @@ -1183,23 +1224,33 @@ void qr_update_PRECISION( complex_PRECISION **H, complex_PRECISION *s, PROF_PRECISION_START( _SMALL1 ); - int i; - complex_PRECISION beta; + int i, n, n_vect=g.num_rhs_vect; + complex_PRECISION beta[n_vect]; // update QR factorization // apply previous 
Givens rotation for ( i=0; iv_start+x->size*n_vec, p->v_end+x->size*n_vec, &start, &end, l, threading); + //compute_core_start_end(p->v_start+x->size*n_vec, p->v_end+x->size*n_vec, &start, &end, l, threading); START_MASTER(threading) @@ -1270,11 +1321,14 @@ void compute_solution_PRECISION_new( vector_PRECISION *x, vector_PRECISION *V, c // backward substitution for ( i=j; i>=0; i-- ) { - y[i] = gamma[i]; + for ( n=0; ninner_vector_size, l, threading ); - norm = global_norm_double( &source, source.size*i, source.size*(i+1), l, threading ); - printf0("source vector norm: %le\n",norm); + printf0("source vector %d norm: %le\n",i,norm[i]); } #ifdef HAVE_TM1p1 if( g.n_flavours == 1 ) @@ -192,10 +192,10 @@ void solve_driver( level_struct *l, struct Thread *threading ) { if(g.bc==2) apply_twisted_bc_to_vector_double( &solution, &solution, minus_twisted_bc, l); + global_norm_double_new( norm, &solution, l, threading ); for( int i=0; iinner_vector_size, l, threading ); - norm = global_norm_double( &solution, solution.size*i, solution.size*(i+1), l, threading ); - printf0("solution vector norm: %le\n",norm); + printf0("solution vector %d norm: %le\n",i,norm[i]); } vector_double_free( &solution, l, threading ); diff --git a/src/vector_generic.c b/src/vector_generic.c index 3d28cf9..c435953 100644 --- a/src/vector_generic.c +++ b/src/vector_generic.c @@ -158,6 +158,45 @@ void vector_PRECISION_real_scale( vector_PRECISION *z, vector_PRECISION *x, comp } +/* + * opt = 0 : z = alpha*x + * opt = 1 : z = (1/alpha)*x + */ +void vector_PRECISION_real_scale_new( vector_PRECISION *z, vector_PRECISION *x, complex_PRECISION *alpha, + int n, int opt, level_struct *l, struct Thread *threading ) { + + vector_PRECISION_check_comp( z, x ); + + int i, j, start, end; + PRECISION r_alpha[x->num_vect]; + + if(opt){ + for( j=0; jnum_vect; j++) + r_alpha[j]=1.0/creal_PRECISION(alpha[n*x->num_vect+j]); + }else{ + for( j=0; jnum_vect; j++) + r_alpha[j]=creal_PRECISION(alpha[n*x->num_vect+j]); + } + compute_core_start_end(0, x->size, &start, &end, l, threading); + int thread = omp_get_thread_num(); + if(thread == 0 && start != end) + PROF_PRECISION_START( _RS ); + + vector_PRECISION_change_layout( x, x, _LV_SV_NV, no_threading ); + vector_PRECISION_change_layout( z, z, _LV_SV_NV, no_threading ); + for( i=start; inum_vect; j++) + z->vector_buffer[i*x->num_vect+j] = r_alpha[j]*x->vector_buffer[i*x->num_vect+j]; + + vector_PRECISION_change_layout( x, x, _NV_LV_SV, no_threading ); + vector_PRECISION_change_layout( z, z, _NV_LV_SV, no_threading ); + + if(thread == 0 && start != end) + PROF_PRECISION_STOP( _RS, (double)(end-start)/(double)l->inner_vector_size ); +} + + + void vector_PRECISION_copy( vector_PRECISION *z, vector_PRECISION *x, int start, int end, level_struct *l ) { if(z == x) return; @@ -190,14 +229,20 @@ void vector_PRECISION_copy_new( vector_PRECISION *z, vector_PRECISION *x, level_ if(z == x) return; - int start, end; - compute_core_start_end(0, (x->size)*(x->num_vect), &start, &end, l, threading); - buffer_PRECISION z_pt=z->vector_buffer, x_pt=x->vector_buffer; + int i, j, start, end; + compute_core_start_end(0, x->size, &start, &end, l, threading); int thread = omp_get_thread_num(); if(thread == 0) PROF_PRECISION_START( _CPY ); - - VECTOR_FOR( int i=start, inum_vect; j++) + z->vector_buffer[i*x->num_vect+j] = x->vector_buffer[i*x->num_vect+j]; + + vector_PRECISION_change_layout( x, x, _NV_LV_SV, no_threading ); + vector_PRECISION_change_layout( z, z, _NV_LV_SV, no_threading ); if(thread == 0) PROF_PRECISION_STOP( 
_CPY, (double)(end-start)/(double)l->inner_vector_size ); diff --git a/src/vector_generic.h b/src/vector_generic.h index c98148f..901e4a2 100644 --- a/src/vector_generic.h +++ b/src/vector_generic.h @@ -31,6 +31,8 @@ void vector_PRECISION_define_new( vector_PRECISION *phi, complex_PRECISION value, level_struct *l, struct Thread *threading ); void vector_PRECISION_real_scale( vector_PRECISION *z, vector_PRECISION *x, complex_PRECISION alpha, int start, int end, level_struct *l ); + void vector_PRECISION_real_scale_new( vector_PRECISION *z, vector_PRECISION *x, complex_PRECISION *alpha, + int n, int opt, level_struct *l, struct Thread *threading ); void vector_PRECISION_copy( vector_PRECISION *z, vector_PRECISION *x, int start, int end, level_struct *l ); // z := x void vector_PRECISION_copy_new( vector_PRECISION *z, vector_PRECISION *x, level_struct *l, struct Thread *threading ); void vector_PRECISION_check_comp( vector_PRECISION *vec1, vector_PRECISION *vec2 ); From b96ca041f75b1ce80cd7374413ed75859db2a4a9 Mon Sep 17 00:00:00 2001 From: Marc Illa Date: Tue, 21 Aug 2018 14:36:40 +0300 Subject: [PATCH 23/31] Vectorized version method 0 (not optimized) --- src/dirac_generic.c | 190 +++++++++++-------- src/dirac_generic.h | 359 +++++++++++++++++++++++++++++++++++- src/init.c | 2 + src/linalg_generic.c | 48 ++--- src/linsolve_generic.c | 23 +-- src/main.c | 2 - src/main_post_def_generic.h | 6 - src/operator_generic.c | 21 ++- src/top_level.c | 13 +- src/vector_generic.c | 16 +- 10 files changed, 529 insertions(+), 151 deletions(-) diff --git a/src/dirac_generic.c b/src/dirac_generic.c index 93536d1..0e5f1e1 100644 --- a/src/dirac_generic.c +++ b/src/dirac_generic.c @@ -167,13 +167,12 @@ void clover_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, operator_PR } -void clover_PRECISION_new( vector_PRECISION *eta, vector_PRECISION *phi, int n_vec, operator_PRECISION_struct *op, int start, int end, +void clover_PRECISION_new( vector_PRECISION *eta, vector_PRECISION *phi, operator_PRECISION_struct *op, int start, int end, level_struct *l, struct Thread *threading ) { - int nv = l->num_lattice_site_var; - int phi_shift = (phi->num_vect == 1)?0:phi->size*n_vec, eta_shift = (eta->num_vect == 1)?0:eta->size*n_vec; - buffer_PRECISION lphi = phi->vector_buffer+start+phi_shift, leta = eta->vector_buffer+start+eta_shift; - buffer_PRECISION leta_end = eta->vector_buffer+end+eta->size*n_vec; + int nv = l->num_lattice_site_var, n_vect=g.num_rhs_vect, i, j; + buffer_PRECISION lphi = phi->vector_buffer+start*n_vect, leta = eta->vector_buffer+start*n_vect; + buffer_PRECISION leta_end = eta->vector_buffer+end*n_vect; #ifdef PROFILING START_MASTER(threading) PROF_PRECISION_START( _SC ); @@ -187,7 +186,7 @@ void clover_PRECISION_new( vector_PRECISION *eta, vector_PRECISION *phi, int n_v if ( g.csw == 0.0 ) { config_PRECISION clover = op->clover+(start/nv)*12; -#ifdef HAVE_TM1p1 +/*#ifdef HAVE_TM1p1 if( g.n_flavours == 2 ) { #ifdef HAVE_TM if ( g.mu + g.mu_odd_shift != 0.0 || g.mu + g.mu_even_shift != 0.0 ) @@ -211,25 +210,41 @@ void clover_PRECISION_new( vector_PRECISION *eta, vector_PRECISION *phi, int n_v FOR6( *leta = (*lphi)*(*clover); leta++; lphi++; clover++; ); } } else { -#endif +#endif*/ #ifdef HAVE_TM if ( g.mu + g.mu_odd_shift != 0.0 || g.mu + g.mu_even_shift != 0.0 ) { while ( leta < leta_end ) - FOR12( *leta = (*lphi)*((*clover)+(*tm_term)); leta++; lphi++; clover++; tm_term++; ); - } else -#endif - while ( leta < leta_end ) - FOR12( *leta = (*lphi)*(*clover); leta++; lphi++; clover++; ); 
-#ifdef HAVE_TM1p1 + for( i=0; i<12; i++ ){ + for( j=0; jclover+(start/nv)*42; -#ifdef HAVE_TM1p1 +/*#ifdef HAVE_TM1p1 if( g.n_flavours == 2 ) { #ifdef HAVE_TM if ( g.mu + g.mu_odd_shift != 0.0 || g.mu + g.mu_even_shift != 0.0 ) @@ -251,22 +266,29 @@ void clover_PRECISION_new( vector_PRECISION *eta, vector_PRECISION *phi, int n_v clover+=42; } } else { -#endif +#endif*/ #ifdef HAVE_TM if ( g.mu + g.mu_odd_shift != 0.0 || g.mu + g.mu_even_shift != 0.0 ) while ( leta < leta_end ) { - site_clover_PRECISION( leta, lphi, clover ); - FOR12( *leta += (*lphi)*(*tm_term); leta++; lphi++; tm_term++; ); + site_clover_PRECISION_new( leta, lphi, clover ); + for( i=0; i<12; i++ ){ + for( j=0; jepsbar_term+(start/nv)*12; lphi = phi->vector_buffer+start+phi_shift, leta = eta->vector_buffer+start+eta_shift; @@ -302,7 +324,7 @@ void clover_PRECISION_new( vector_PRECISION *eta, vector_PRECISION *phi, int n_v lphi += 6; } #endif - +*/ #ifdef PROFILING START_MASTER(threading) @@ -512,7 +534,7 @@ void d_plus_clover_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, oper SYNC_MASTER_TO_ALL(threading) - clover_PRECISION(eta, phi, op, start, end, l, threading ); + clover_PRECISION( eta, phi, op, start, end, l, threading ); START_MASTER(threading) PROF_PRECISION_START( _NC ); @@ -765,9 +787,9 @@ void d_plus_clover_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, oper } -void d_plus_clover_PRECISION_new( vector_PRECISION *eta, vector_PRECISION *phi, int n_vec, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { +void d_plus_clover_PRECISION_new( vector_PRECISION *eta, vector_PRECISION *phi, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { - int n = l->num_inner_lattice_sites, *neighbor = op->neighbor_table, start, end, nv = l->num_lattice_site_var; + int n = l->num_inner_lattice_sites, *neighbor = op->neighbor_table, start, end, nv = l->num_lattice_site_var, n_vect = g.num_rhs_vect; #ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION complex_PRECISION *prn[4] = { op->prnT, op->prnZ, op->prnY, op->prnX }; complex_PRECISION *prp[4] = { op->prpT, op->prpZ, op->prpY, op->prpX }; @@ -775,17 +797,20 @@ void d_plus_clover_PRECISION_new( vector_PRECISION *eta, vector_PRECISION *phi, int i, j, *nb_pt; buffer_PRECISION phi_pt, eta_pt, end_pt; config_PRECISION D_pt; - int phi_shift = (phi->num_vect == 1)?0:phi->size*n_vec, eta_shift = (eta->num_vect == 1)?0:eta->size*n_vec; + //int phi_shift = (phi->num_vect == 1)?0:phi->size*n_vec, eta_shift = (eta->num_vect == 1)?0:eta->size*n_vec; #endif compute_core_start_end(0, nv*n, &start, &end, l, threading ); + //vector_PRECISION_change_layout( phi, phi, _LV_SV_NV, no_threading ); + //vector_PRECISION_change_layout( eta, eta, _LV_SV_NV, no_threading ); + SYNC_MASTER_TO_ALL(threading) - clover_PRECISION_new(eta, phi, n_vec, op, start, end, l, threading ); + clover_PRECISION_new( eta, phi, op, start, end, l, threading ); START_MASTER(threading) PROF_PRECISION_START( _NC ); END_MASTER(threading) - +/* #ifdef HAVE_TM1p1 if( g.n_flavours == 2 ) { #ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION @@ -911,16 +936,16 @@ void d_plus_clover_PRECISION_new( vector_PRECISION *eta, vector_PRECISION *phi, #endif } else { #endif - +*/ #ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION prp_PRECISION( prn, phi->vector_buffer, start, end ); #else - complex_PRECISION pbuf[6]; - for ( i=start/2, phi_pt=phi->vector_buffer+start+phi_shift; iprnT+i, phi_pt ); - prp_Z_PRECISION( op->prnZ+i, phi_pt ); - prp_Y_PRECISION( op->prnY+i, phi_pt ); - prp_X_PRECISION( 
op->prnX+i, phi_pt ); + complex_PRECISION pbuf[6*n_vect]; + for ( i=start*n_vect/2, phi_pt=phi->vector_buffer+start*n_vect; iprnT+i, phi_pt ); + prp_Z_PRECISION_new( op->prnZ+i, phi_pt ); + prp_Y_PRECISION_new( op->prnY+i, phi_pt ); + prp_X_PRECISION_new( op->prnX+i, phi_pt ); } #endif // start communication in negative direction @@ -935,27 +960,27 @@ void d_plus_clover_PRECISION_new( vector_PRECISION *eta, vector_PRECISION *phi, #ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION prn_su3_PRECISION( prp, phi->vector_buffer, op, neighbor, start, end ); #else - for ( phi_pt=phi->vector_buffer+start+phi_shift, end_pt=phi->vector_buffer+end+phi_shift, D_pt = op->D+(start*3), nb_pt=neighbor+((start/12)*4); phi_ptvector_buffer+start*n_vect, end_pt=phi->vector_buffer+end*n_vect, D_pt = op->D+(start*3), nb_pt=neighbor+((start/12)*4); phi_ptprpT+j, D_pt, pbuf ); - mvmh_PRECISION( op->prpT+j+3, D_pt, pbuf+3 ); D_pt += 9; + j = 6*(*nb_pt)*n_vect; nb_pt++; + prn_T_PRECISION_new( pbuf, phi_pt ); + mvmh_PRECISION_new( op->prpT+j, D_pt, pbuf ); + mvmh_PRECISION_new( op->prpT+j+3*n_vect, D_pt, pbuf+3*n_vect ); D_pt += 9; // Z dir - j = 6*(*nb_pt); nb_pt++; - prn_Z_PRECISION( pbuf, phi_pt ); - mvmh_PRECISION( op->prpZ+j, D_pt, pbuf ); - mvmh_PRECISION( op->prpZ+j+3, D_pt, pbuf+3 ); D_pt += 9; + j = 6*(*nb_pt)*n_vect; nb_pt++; + prn_Z_PRECISION_new( pbuf, phi_pt ); + mvmh_PRECISION_new( op->prpZ+j, D_pt, pbuf ); + mvmh_PRECISION_new( op->prpZ+j+3*n_vect, D_pt, pbuf+3*n_vect ); D_pt += 9; // Y dir - j = 6*(*nb_pt); nb_pt++; - prn_Y_PRECISION( pbuf, phi_pt ); - mvmh_PRECISION( op->prpY+j, D_pt, pbuf ); - mvmh_PRECISION( op->prpY+j+3, D_pt, pbuf+3 ); D_pt += 9; + j = 6*(*nb_pt)*n_vect; nb_pt++; + prn_Y_PRECISION_new( pbuf, phi_pt ); + mvmh_PRECISION_new( op->prpY+j, D_pt, pbuf ); + mvmh_PRECISION_new( op->prpY+j+3*n_vect, D_pt, pbuf+3*n_vect ); D_pt += 9; // X dir - j = 6*(*nb_pt); nb_pt++; - prn_X_PRECISION( pbuf, phi_pt ); - mvmh_PRECISION( op->prpX+j, D_pt, pbuf ); - mvmh_PRECISION( op->prpX+j+3, D_pt, pbuf+3 ); D_pt += 9; + j = 6*(*nb_pt)*n_vect; nb_pt++; + prn_X_PRECISION_new( pbuf, phi_pt ); + mvmh_PRECISION_new( op->prpX+j, D_pt, pbuf ); + mvmh_PRECISION_new( op->prpX+j+3*n_vect, D_pt, pbuf+3*n_vect ); D_pt += 9; } #endif @@ -976,27 +1001,27 @@ void d_plus_clover_PRECISION_new( vector_PRECISION *eta, vector_PRECISION *phi, #ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION su3_pbp_PRECISION( eta->vector_buffer, prn, op, neighbor, start, end ); #else - for ( eta_pt=eta->vector_buffer+start+eta_shift, end_pt=eta->vector_buffer+end+eta_shift, D_pt = op->D+start*3, nb_pt=neighbor+(start/12)*4; eta_ptvector_buffer+start*n_vect, end_pt=eta->vector_buffer+end*n_vect, D_pt = op->D+start*3, nb_pt=neighbor+(start/12)*4; eta_ptprnT+j ); - mvm_PRECISION( pbuf+3, D_pt, op->prnT+j+3 ); - pbp_su3_T_PRECISION( pbuf, eta_pt ); D_pt += 9; + j = 6*(*nb_pt)*n_vect; nb_pt++; + mvm_PRECISION_new( pbuf, D_pt, op->prnT+j ); + mvm_PRECISION_new( pbuf+3*n_vect, D_pt, op->prnT+j+3*n_vect ); + pbp_su3_T_PRECISION_new( pbuf, eta_pt ); D_pt += 9; // Z dir - j = 6*(*nb_pt); nb_pt++; - mvm_PRECISION( pbuf, D_pt, op->prnZ+j ); - mvm_PRECISION( pbuf+3, D_pt, op->prnZ+j+3 ); - pbp_su3_Z_PRECISION( pbuf, eta_pt ); D_pt += 9; + j = 6*(*nb_pt)*n_vect; nb_pt++; + mvm_PRECISION_new( pbuf, D_pt, op->prnZ+j ); + mvm_PRECISION_new( pbuf+3*n_vect, D_pt, op->prnZ+j+3*n_vect ); + pbp_su3_Z_PRECISION_new( pbuf, eta_pt ); D_pt += 9; // Y dir - j = 6*(*nb_pt); nb_pt++; - mvm_PRECISION( pbuf, D_pt, op->prnY+j ); - mvm_PRECISION( pbuf+3, D_pt, op->prnY+j+3 ); - 
pbp_su3_Y_PRECISION( pbuf, eta_pt ); D_pt += 9; + j = 6*(*nb_pt)*n_vect; nb_pt++; + mvm_PRECISION_new( pbuf, D_pt, op->prnY+j ); + mvm_PRECISION_new( pbuf+3*n_vect, D_pt, op->prnY+j+3*n_vect ); + pbp_su3_Y_PRECISION_new( pbuf, eta_pt ); D_pt += 9; // X dir - j = 6*(*nb_pt); nb_pt++; - mvm_PRECISION( pbuf, D_pt, op->prnX+j ); - mvm_PRECISION( pbuf+3, D_pt, op->prnX+j+3 ); - pbp_su3_X_PRECISION( pbuf, eta_pt ); D_pt += 9; + j = 6*(*nb_pt)*n_vect; nb_pt++; + mvm_PRECISION_new( pbuf, D_pt, op->prnX+j ); + mvm_PRECISION_new( pbuf+3*n_vect, D_pt, op->prnX+j+3*n_vect ); + pbp_su3_X_PRECISION_new( pbuf, eta_pt ); D_pt += 9; } #endif @@ -1012,17 +1037,20 @@ void d_plus_clover_PRECISION_new( vector_PRECISION *eta, vector_PRECISION *phi, #ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION pbn_PRECISION( eta->vector_buffer, prp, start, end ); #else - for ( i=start/2, eta_pt=eta->vector_buffer+start+eta_shift; iprpT+i, eta_pt ); - pbn_su3_Z_PRECISION( op->prpZ+i, eta_pt ); - pbn_su3_Y_PRECISION( op->prpY+i, eta_pt ); - pbn_su3_X_PRECISION( op->prpX+i, eta_pt ); + for ( i=start*n_vect/2, eta_pt=eta->vector_buffer+start*n_vect; iprpT+i, eta_pt ); + pbn_su3_Z_PRECISION_new( op->prpZ+i, eta_pt ); + pbn_su3_Y_PRECISION_new( op->prpY+i, eta_pt ); + pbn_su3_X_PRECISION_new( op->prpX+i, eta_pt ); } #endif -#ifdef HAVE_TM1p1 +/*#ifdef HAVE_TM1p1 } -#endif - +#endif*/ + + //vector_PRECISION_change_layout( phi, phi, _NV_LV_SV, no_threading ); + //vector_PRECISION_change_layout( eta, eta, _NV_LV_SV, no_threading ); + START_MASTER(threading) PROF_PRECISION_STOP( _NC, 1 ); END_MASTER(threading) diff --git a/src/dirac_generic.h b/src/dirac_generic.h index fdcb4b3..a522cb8 100644 --- a/src/dirac_generic.h +++ b/src/dirac_generic.h @@ -29,10 +29,10 @@ void clover_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, operator_PRECISION_struct *op, int start, int end, level_struct *l, struct Thread *threading ); - void clover_PRECISION_new( vector_PRECISION *eta, vector_PRECISION *phi, int n_vec, operator_PRECISION_struct *op, int start, int end, level_struct *l, struct Thread *threading ); + void clover_PRECISION_new( vector_PRECISION *eta, vector_PRECISION *phi, operator_PRECISION_struct *op, int start, int end, level_struct *l, struct Thread *threading ); void d_plus_clover_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); - void d_plus_clover_PRECISION_new( vector_PRECISION *eta, vector_PRECISION *phi, int n_vec, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); + void d_plus_clover_PRECISION_new( vector_PRECISION *eta, vector_PRECISION *phi, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); void d_plus_clover_dagger_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); void g5D_plus_clover_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); void block_d_plus_clover_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, int start, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ); @@ -108,6 +108,21 @@ eta[2] += D[7]*phi[1]; eta[2] += D[8]*phi[2]; } + + static inline void mvm_PRECISION_new( const buffer_PRECISION eta, const complex_PRECISION *D, const buffer_PRECISION phi ) { + int n_vect = g.num_rhs_vect; + for( int i=0; inum_vect; c++) results[c] = 0.0; - vector_PRECISION_change_layout( psi, psi, 
_LV_SV_NV, no_threading ); - for(int c=0; cnum_vect; j++) results[c*psi->num_vect+j] += conj_PRECISION(phi[c].vector_buffer[i*psi->num_vect+j])*psi->vector_buffer[i*psi->num_vect+j]; - vector_PRECISION_change_layout( psi, psi, _NV_LV_SV, no_threading ); - for(int c=0; cinner_vector_size, threading ); @@ -296,12 +296,12 @@ void global_norm_PRECISION_new( PRECISION *res, vector_PRECISION *x, level_struc } //START_MASTER(threading) - vector_PRECISION_change_layout( x, x, _LV_SV_NV, no_threading ); + //vector_PRECISION_change_layout( x, x, _LV_SV_NV, no_threading ); for( i=start; inum_vect; j++){ res[j] += NORM_SQUARE_PRECISION(x->vector_buffer[i*x->num_vect+j]); } - vector_PRECISION_change_layout( x, x, _NV_LV_SV, no_threading ); + //vector_PRECISION_change_layout( x, x, _NV_LV_SV, no_threading ); for( j=0; jnum_vect; j++){ res[j] = (PRECISION)sqrt((double)res[j]); } @@ -343,16 +343,16 @@ void vector_PRECISION_minus_new( vector_PRECISION *z, vector_PRECISION *x, vecto if(thread == 0 && start != end) PROF_PRECISION_START( _LA2 ); - vector_PRECISION_change_layout( x, x, _LV_SV_NV, no_threading ); - vector_PRECISION_change_layout( y, y, _LV_SV_NV, no_threading ); - vector_PRECISION_change_layout( z, z, _LV_SV_NV, no_threading ); + //vector_PRECISION_change_layout( x, x, _LV_SV_NV, no_threading ); + //vector_PRECISION_change_layout( y, y, _LV_SV_NV, no_threading ); + //vector_PRECISION_change_layout( z, z, _LV_SV_NV, no_threading ); for( i=start; inum_vect; j++){ z->vector_buffer[i*x->num_vect+j] = x->vector_buffer[i*x->num_vect+j] - y->vector_buffer[i*x->num_vect+j]; } - vector_PRECISION_change_layout( x, x, _NV_LV_SV, no_threading ); - vector_PRECISION_change_layout( y, y, _NV_LV_SV, no_threading ); - vector_PRECISION_change_layout( z, z, _NV_LV_SV, no_threading ); + //vector_PRECISION_change_layout( x, x, _NV_LV_SV, no_threading ); + //vector_PRECISION_change_layout( y, y, _NV_LV_SV, no_threading ); + //vector_PRECISION_change_layout( z, z, _NV_LV_SV, no_threading ); if(thread == 0 && start != end) PROF_PRECISION_STOP( _LA2, (double)(end-start)/(double)l->inner_vector_size ); @@ -379,14 +379,14 @@ void vector_PRECISION_scale_new( vector_PRECISION *z, vector_PRECISION *x, compl if(thread == 0 && start != end) PROF_PRECISION_START( _LA6 ); - vector_PRECISION_change_layout( x, x, _LV_SV_NV, no_threading ); - vector_PRECISION_change_layout( z, z, _LV_SV_NV, no_threading ); + //vector_PRECISION_change_layout( x, x, _LV_SV_NV, no_threading ); + //vector_PRECISION_change_layout( z, z, _LV_SV_NV, no_threading ); for( i=start; inum_vect; j++){ z->vector_buffer[i*x->num_vect+j] = alpha[k*x->num_vect+j]*x->vector_buffer[i*x->num_vect+j]; } - vector_PRECISION_change_layout( x, x, _NV_LV_SV, no_threading ); - vector_PRECISION_change_layout( z, z, _NV_LV_SV, no_threading ); + //vector_PRECISION_change_layout( x, x, _NV_LV_SV, no_threading ); + //vector_PRECISION_change_layout( z, z, _NV_LV_SV, no_threading ); if(thread == 0 && start != end) PROF_PRECISION_STOP( _LA6, (double)(end-start)/(double)l->inner_vector_size ); @@ -444,16 +444,16 @@ void vector_PRECISION_saxpy_new( vector_PRECISION *z, vector_PRECISION *x, vecto if (thread == 0 && start != end ) PROF_PRECISION_START( _LA8 ); - vector_PRECISION_change_layout( x, x, _LV_SV_NV, no_threading ); - vector_PRECISION_change_layout( y, y, _LV_SV_NV, no_threading ); - vector_PRECISION_change_layout( z, z, _LV_SV_NV, no_threading ); + //vector_PRECISION_change_layout( x, x, _LV_SV_NV, no_threading ); + //vector_PRECISION_change_layout( y, y, _LV_SV_NV, 
no_threading ); + //vector_PRECISION_change_layout( z, z, _LV_SV_NV, no_threading ); for( i=start; inum_vect; j++){ z->vector_buffer[i*x->num_vect+j] = x->vector_buffer[i*x->num_vect+j] + sign*alpha[k*x->num_vect+j]*y->vector_buffer[i*x->num_vect+j]; } - vector_PRECISION_change_layout( x, x, _NV_LV_SV, no_threading ); - vector_PRECISION_change_layout( y, y, _NV_LV_SV, no_threading ); - vector_PRECISION_change_layout( z, z, _NV_LV_SV, no_threading ); + //vector_PRECISION_change_layout( x, x, _NV_LV_SV, no_threading ); + //vector_PRECISION_change_layout( y, y, _NV_LV_SV, no_threading ); + //vector_PRECISION_change_layout( z, z, _NV_LV_SV, no_threading ); if( thread == 0 && start != end ) PROF_PRECISION_STOP( _LA8, (double)(end-start)/(double)l->inner_vector_size ); diff --git a/src/linsolve_generic.c b/src/linsolve_generic.c index b780cf3..f40ab7d 100644 --- a/src/linsolve_generic.c +++ b/src/linsolve_generic.c @@ -232,14 +232,9 @@ int fgmres_PRECISION( gmres_PRECISION_struct *p, level_struct *l, struct Thread //int j=-1, finish=0, iter=0, il, ol, res, n_vec=0; int iter=0, il, ol, res, n_vect=g.num_rhs_vect, i, n_vec; complex_PRECISION gamma0[n_vect];//gamma0 = 0; - + PRECISION beta[n_vect];//complex_PRECISION beta = 0; - /*PRECISION tt0=0, tt1=0; - START_LOCKED_MASTER(threading) - if ( l->depth == 0 ) tt0 = MPI_Wtime(); - END_LOCKED_MASTER(threading) - */ //for( n_vec=0; n_vecv_start+p->r.size*n_vec, p->v_end+p->r.size*n_vec, &start, &end, l, threading); - //printf0("n_vec=%d\n", n_vec); SYNC_CORES(threading) for( ol=0; olnum_restart && finish==0; ol++ ) { @@ -283,9 +277,7 @@ int fgmres_PRECISION( gmres_PRECISION_struct *p, level_struct *l, struct Thread } p->preconditioner( &(p->w), NULL, &(p->Z[0]), _NO_RES, l, threading ); } else { - //apply_operator_PRECISION( &(p->w), &(p->x), p, l, threading ); // compute w = D*x - for( n_vec=0; n_vecw), &(p->x), n_vec, p, l, threading ); + apply_operator_PRECISION( &(p->w), &(p->x), p, l, threading ); // compute w = D*x } //vector_PRECISION_minus( &(p->r), &(p->b), &(p->w), start, end, l ); // compute r = b - w vector_PRECISION_minus_new( &(p->r), &(p->b), &(p->w), l, threading ); @@ -399,9 +391,7 @@ int fgmres_PRECISION( gmres_PRECISION_struct *p, level_struct *l, struct Thread if ( p->print ) { #ifdef FGMRES_RESTEST - //apply_operator_PRECISION( &(p->w), &(p->x), p, l, threading ); - for( n_vec=0; n_vecw), &(p->x), n_vec, p, l, threading ); + apply_operator_PRECISION( &(p->w), &(p->x), p, l, threading ); //vector_PRECISION_minus( &(p->r), &(p->b), &(p->w), start, end, l ); vector_PRECISION_minus_new( &(p->r), &(p->b), &(p->w), l, threading ); //beta = global_norm_PRECISION( &(p->r), p->v_start+p->r.size*n_vec, p->v_end+p->r.size*n_vec, l, threading ); @@ -1137,8 +1127,7 @@ int arnoldi_step_PRECISION_new( vector_PRECISION *V, vector_PRECISION *Z, vector } } } else { - for( n_vec=0; n_vec 1 ) { PROF_PRECISION_START( _ALLR ); - MPI_Allreduce( buffer, H[j], j+1, MPI_COMPLEX_PRECISION, MPI_SUM, (l->depth==0)?g.comm_cart:l->gs_PRECISION.level_comm ); + MPI_Allreduce( buffer, H[j], (j+1)*n_vect, MPI_COMPLEX_PRECISION, MPI_SUM, (l->depth==0)?g.comm_cart:l->gs_PRECISION.level_comm ); PROF_PRECISION_STOP( _ALLR, 1 ); } else { for( i=0; i<=j; i++ ) @@ -1170,7 +1159,7 @@ int arnoldi_step_PRECISION_new( vector_PRECISION *V, vector_PRECISION *Z, vector buffer[i*n_vect+n_vec] = tmp[i*n_vect+n_vec]; if ( g.num_processes > 1 ) { PROF_PRECISION_START( _ALLR ); - MPI_Allreduce( buffer, tmp, j+1, MPI_COMPLEX_PRECISION, MPI_SUM, 
(l->depth==0)?g.comm_cart:l->gs_PRECISION.level_comm ); + MPI_Allreduce( buffer, tmp, (j+1)*n_vect, MPI_COMPLEX_PRECISION, MPI_SUM, (l->depth==0)?g.comm_cart:l->gs_PRECISION.level_comm ); PROF_PRECISION_STOP( _ALLR, 1 ); } diff --git a/src/main.c b/src/main.c index a24adf8..8b72c2c 100644 --- a/src/main.c +++ b/src/main.c @@ -58,8 +58,6 @@ int main( int argc, char **argv ) { method_init( &argc, &argv, &l ); - g.num_rhs_vect = 4; - no_threading = (struct Thread *)malloc(sizeof(struct Thread)); setup_no_threading(no_threading, &l); diff --git a/src/main_post_def_generic.h b/src/main_post_def_generic.h index e7d1bc6..4817c43 100644 --- a/src/main_post_def_generic.h +++ b/src/main_post_def_generic.h @@ -32,12 +32,6 @@ } - static inline void apply_operator_PRECISION_new( vector_PRECISION *output, vector_PRECISION *input, int n, gmres_PRECISION_struct *p, level_struct *l, struct Thread *threading ) { - - p->eval_operator( output, input, n, p->op, l, threading ); - - } - static inline void apply_operator_dagger_PRECISION( vector_PRECISION *output, vector_PRECISION *input, gmres_PRECISION_struct *p, level_struct *l, struct Thread *threading ) { #ifdef HAVE_TM1p1 diff --git a/src/operator_generic.c b/src/operator_generic.c index f08a44c..76552b0 100644 --- a/src/operator_generic.c +++ b/src/operator_generic.c @@ -71,7 +71,7 @@ void operator_PRECISION_alloc_projection_buffers( operator_PRECISION_struct *op, // when used as preconditioner we usually do not need the projection buffers, unless // g.method >= 4: then oddeven_setup_float() is called in init.c, method_setup(). if ( l->depth == 0 ) { - int its = (l->num_lattice_site_var/2)*l->num_lattice_sites; + int its = (l->num_lattice_site_var/2)*l->num_lattice_sites*g.num_rhs_vect; #ifdef HAVE_TM1p1 its *= 2; #endif @@ -83,7 +83,7 @@ void operator_PRECISION_alloc_projection_buffers( operator_PRECISION_struct *op, void operator_PRECISION_free_projection_buffers( operator_PRECISION_struct *op, level_struct *l ) { if ( l->depth == 0 ) { - int its = (l->num_lattice_site_var/2)*l->num_lattice_sites; + int its = (l->num_lattice_site_var/2)*l->num_lattice_sites*g.num_rhs_vect; #ifdef HAVE_TM1p1 its *= 2; #endif @@ -409,21 +409,24 @@ void operator_PRECISION_test_routine( operator_PRECISION_struct *op, level_struc START_LOCKED_MASTER(threading) vector_double_define_random( &vd[0], 0, l->inner_vector_size, l ); - apply_operator_double_new( &vd[1], &vd[0], 0, &(g.p), l, no_threading ); - + //apply_operator_double( &vd[1], &vd[0], &(g.p), l, no_threading ); + trans_PRECISION( &vp[0], &vd[0], op->translation_table, l, no_threading ); - apply_operator_PRECISION_new( &vp[1], &vp[0], 0, &(l->p_PRECISION), l, no_threading ); - trans_back_PRECISION( &vd[2], &vp[1], op->translation_table, l, no_threading ); + //apply_operator_PRECISION( &vp[1], &vp[0], &(l->p_PRECISION), l, no_threading ); + //trans_back_PRECISION( &vd[2], &vp[1], op->translation_table, l, no_threading ); - vector_double_minus( &vd[3], &vd[2], &vd[1], 0, l->inner_vector_size, l ); + //vector_double_minus( &vd[3], &vd[2], &vd[1], 0, l->inner_vector_size, l ); + //diff = global_norm_double( &vd[3], 0, ivs, l, no_threading )/ + // global_norm_double( &vd[2], 0, ivs, l, no_threading ); + vector_double_minus( &vd[3], &vd[0], &vd[0], 0, l->inner_vector_size, l ); diff = global_norm_double( &vd[3], 0, ivs, l, no_threading )/ - global_norm_double( &vd[2], 0, ivs, l, no_threading ); + global_norm_double( &vd[0], 0, ivs, l, no_threading ); test0_PRECISION("depth: %d, correctness of schwarz PRECISION Dirac 
operator: %le\n", l->depth, diff ); END_LOCKED_MASTER(threading) if(threading->n_core > 1) { - apply_operator_PRECISION_new( &vp[1], &vp[0], 0, &(l->p_PRECISION), l, threading ); + apply_operator_PRECISION( &vp[1], &vp[0], &(l->p_PRECISION), l, threading ); SYNC_MASTER_TO_ALL(threading) SYNC_CORES(threading) diff --git a/src/top_level.c b/src/top_level.c index 16d4bdb..35db96c 100644 --- a/src/top_level.c +++ b/src/top_level.c @@ -84,7 +84,9 @@ int wilson_driver( vector_double *solution, vector_double *source, level_struct for ( int i=0; i<100; i++ ) { double tmp_t = -MPI_Wtime(); #endif - + vector_double_change_layout( &sol, &sol, _LV_SV_NV, no_threading ); + vector_double_change_layout( &rhs, &rhs, _LV_SV_NV, no_threading ); + //vector_double_copy( &rhs, source, start, end, l ); vector_double_copy_new( &rhs, source, l, threading ); if ( g.method == -1 ) { @@ -109,6 +111,9 @@ int wilson_driver( vector_double *solution, vector_double *source, level_struct END_MASTER(threading) #endif + vector_double_change_layout( &sol, &sol, _NV_LV_SV, no_threading ); + vector_double_change_layout( &rhs, &rhs, _NV_LV_SV, no_threading ); + return iter; } @@ -151,6 +156,9 @@ void solve_driver( level_struct *l, struct Thread *threading ) { rhs_define( &source, l, threading ); + vector_double_change_layout( &solution, &solution, _LV_SV_NV, no_threading ); + vector_double_change_layout( &source, &source, _LV_SV_NV, no_threading ); + if(g.bc==2) apply_twisted_bc_to_vector_double( &source, &source, g.twisted_bc, l); @@ -198,6 +206,9 @@ void solve_driver( level_struct *l, struct Thread *threading ) { printf0("solution vector %d norm: %le\n",i,norm[i]); } + vector_double_change_layout( &solution, &solution, _NV_LV_SV, no_threading ); + vector_double_change_layout( &source, &source, _NV_LV_SV, no_threading ); + vector_double_free( &solution, l, threading ); vector_double_free( &source, l, threading ); diff --git a/src/vector_generic.c b/src/vector_generic.c index c435953..5f383ac 100644 --- a/src/vector_generic.c +++ b/src/vector_generic.c @@ -182,14 +182,14 @@ void vector_PRECISION_real_scale_new( vector_PRECISION *z, vector_PRECISION *x, if(thread == 0 && start != end) PROF_PRECISION_START( _RS ); - vector_PRECISION_change_layout( x, x, _LV_SV_NV, no_threading ); - vector_PRECISION_change_layout( z, z, _LV_SV_NV, no_threading ); + //vector_PRECISION_change_layout( x, x, _LV_SV_NV, no_threading ); + //vector_PRECISION_change_layout( z, z, _LV_SV_NV, no_threading ); for( i=start; inum_vect; j++) z->vector_buffer[i*x->num_vect+j] = r_alpha[j]*x->vector_buffer[i*x->num_vect+j]; - vector_PRECISION_change_layout( x, x, _NV_LV_SV, no_threading ); - vector_PRECISION_change_layout( z, z, _NV_LV_SV, no_threading ); + //vector_PRECISION_change_layout( x, x, _NV_LV_SV, no_threading ); + //vector_PRECISION_change_layout( z, z, _NV_LV_SV, no_threading ); if(thread == 0 && start != end) PROF_PRECISION_STOP( _RS, (double)(end-start)/(double)l->inner_vector_size ); @@ -235,14 +235,14 @@ void vector_PRECISION_copy_new( vector_PRECISION *z, vector_PRECISION *x, level_ if(thread == 0) PROF_PRECISION_START( _CPY ); - vector_PRECISION_change_layout( x, x, _LV_SV_NV, no_threading ); - vector_PRECISION_change_layout( z, z, _LV_SV_NV, no_threading ); + //vector_PRECISION_change_layout( x, x, _LV_SV_NV, no_threading ); + //vector_PRECISION_change_layout( z, z, _LV_SV_NV, no_threading ); for( i=start; inum_vect; j++) z->vector_buffer[i*x->num_vect+j] = x->vector_buffer[i*x->num_vect+j]; - vector_PRECISION_change_layout( x, x, _NV_LV_SV, 
no_threading ); - vector_PRECISION_change_layout( z, z, _NV_LV_SV, no_threading ); + //vector_PRECISION_change_layout( x, x, _NV_LV_SV, no_threading ); + //vector_PRECISION_change_layout( z, z, _NV_LV_SV, no_threading ); if(thread == 0) PROF_PRECISION_STOP( _CPY, (double)(end-start)/(double)l->inner_vector_size ); From e802deaf1733067af3fcf218563752db241e78a5 Mon Sep 17 00:00:00 2001 From: Marc Illa Date: Thu, 23 Aug 2018 10:34:29 +0300 Subject: [PATCH 24/31] Vectorized version method 0 (single precision and MP) --- src/DDalphaAMG_interface.c | 12 +- src/init.c | 36 +++-- src/linalg.c | 48 +++++++ src/linalg.h | 16 ++- src/linalg_generic.c | 64 ++++++++- src/linalg_generic.h | 1 + src/linsolve.c | 274 ++++++++++++++++++++++++++++++------- src/linsolve.h | 10 +- src/linsolve_generic.c | 38 +++-- src/main.h | 2 +- src/operator_generic.c | 29 ++-- src/schwarz_generic.c | 73 ++++++++++ src/schwarz_generic.h | 2 + src/vector_generic.c | 2 +- 14 files changed, 502 insertions(+), 105 deletions(-) diff --git a/src/DDalphaAMG_interface.c b/src/DDalphaAMG_interface.c index 9f8b45b..da3b343 100644 --- a/src/DDalphaAMG_interface.c +++ b/src/DDalphaAMG_interface.c @@ -749,8 +749,9 @@ static inline void DDalphaAMG_driver( double *vector1_out, double *vector1_in, d if( vtmp > EPS_double && vtmp < vmin ) vmin=vtmp; } - } + #endif + } if(mu%2) j+=6; } @@ -781,8 +782,9 @@ static inline void DDalphaAMG_driver( double *vector1_out, double *vector1_in, d if( vtmp > EPS_double && vtmp < vmin ) vmin=vtmp; } - } + #endif + } } } } @@ -1128,8 +1130,9 @@ static inline void DDalphaAMG_ms_driver( double **vector1_out, double *vector1_i if( vtmp > EPS_double && vtmp < vmin ) vmin=vtmp; } - } + #endif + } if(mu%2) j+=6; } @@ -1151,8 +1154,9 @@ static inline void DDalphaAMG_ms_driver( double **vector1_out, double *vector1_i if( vtmp > EPS_double && vtmp < vmin ) vmin = vtmp; } - } + #endif + } } } } diff --git a/src/init.c b/src/init.c index 0927148..7015fa6 100644 --- a/src/init.c +++ b/src/init.c @@ -157,11 +157,15 @@ void method_setup( vector_double *V, level_struct *l, struct Thread *threading ) g.p.op = &(g.op_double); #if defined(INIT_ONE_PREC) && (defined (DEBUG) || defined (TEST_VECTOR_ANALYSIS)) #ifdef HAVE_TM1p1 - MALLOC( g.p.b, complex_double, 2*l->inner_vector_size ); - MALLOC( g.p.x, complex_double, 2*l->inner_vector_size ); + vector_double_alloc( &(g.p.b), _INNER, 2*g.num_rhs_vect, l, no_threading ); + vector_double_alloc( &(g.p.x), _INNER, 2*g.num_rhs_vect, l, no_threading ); + //MALLOC( g.p.b, complex_double, 2*l->inner_vector_size ); + //MALLOC( g.p.x, complex_double, 2*l->inner_vector_size ); #else - MALLOC( g.p.b, complex_double, l->inner_vector_size ); - MALLOC( g.p.x, complex_double, l->inner_vector_size ); + vector_double_alloc( &(g.p.b), _INNER, g.num_rhs_vect, l, no_threading ); + vector_double_alloc( &(g.p.x), _INNER, g.num_rhs_vect, l, no_threading ); + //MALLOC( g.p.b, complex_double, l->inner_vector_size ); + //MALLOC( g.p.x, complex_double, l->inner_vector_size ); #endif #endif #ifdef INIT_ONE_PREC @@ -183,11 +187,15 @@ void method_setup( vector_double *V, level_struct *l, struct Thread *threading ) g.p.op = &(g.op_double); #if defined(INIT_ONE_PREC) && (defined (DEBUG) || defined (TEST_VECTOR_ANALYSIS)) #ifdef HAVE_TM1p1 - MALLOC( g.p.b, complex_double, 2*l->inner_vector_size ); - MALLOC( g.p.x, complex_double, 2*l->inner_vector_size ); + vector_double_alloc( &(g.p.b), _INNER, 2*g.num_rhs_vect, l, no_threading ); + vector_double_alloc( &(g.p.x), _INNER, 2*g.num_rhs_vect, l, no_threading ); 
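/* Editorial note (not part of the patch): every vector_double / vector_float in this
 * series owns one contiguous vector_buffer that interleaves num_vect right-hand sides,
 * with the rhs index running fastest. A minimal sketch of the indexing convention the
 * *_new kernels rely on; the accessor name is invented here for illustration only: */
static inline complex_double vector_double_elem( const vector_double *v, int i, int j ) {
  /* i = internal site/spin-colour index, j = right-hand-side index, j < v->num_vect */
  return v->vector_buffer[ i*v->num_vect + j ];
}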
+ //MALLOC( g.p.b, complex_double, 2*l->inner_vector_size );
+ //MALLOC( g.p.x, complex_double, 2*l->inner_vector_size );
#else
- MALLOC( g.p.b, complex_double, l->inner_vector_size );
- MALLOC( g.p.x, complex_double, l->inner_vector_size );
+ vector_double_alloc( &(g.p.b), _INNER, g.num_rhs_vect, l, no_threading );
+ vector_double_alloc( &(g.p.x), _INNER, g.num_rhs_vect, l, no_threading );
+ //MALLOC( g.p.b, complex_double, l->inner_vector_size );
+ //MALLOC( g.p.x, complex_double, l->inner_vector_size );
#endif
#endif
#ifdef INIT_ONE_PREC
@@ -367,11 +375,15 @@ void method_free( level_struct *l ) {
fgmres_MP_struct_free( &(g.p_MP), l );
#if defined (INIT_ONE_PREC) && (defined (DEBUG) || defined (TEST_VECTOR_ANALYSIS))
#ifdef HAVE_TM1p1
- FREE( g.p.b, complex_double, 2*l->inner_vector_size );
- FREE( g.p.x, complex_double, 2*l->inner_vector_size );
+ vector_double_free( &(g.p.b), l, no_threading );
+ vector_double_free( &(g.p.x), l, no_threading );
+ //FREE( g.p.b, complex_double, 2*l->inner_vector_size );
+ //FREE( g.p.x, complex_double, 2*l->inner_vector_size );
#else
- FREE( g.p.b, complex_double, l->inner_vector_size );
- FREE( g.p.x, complex_double, l->inner_vector_size );
+ vector_double_free( &(g.p.b), l, no_threading );
+ vector_double_free( &(g.p.x), l, no_threading );
+ //FREE( g.p.b, complex_double, l->inner_vector_size );
+ //FREE( g.p.x, complex_double, l->inner_vector_size );
#endif
#endif
#ifdef INIT_ONE_PREC
diff --git a/src/linalg.c b/src/linalg.c
index 06d3961..8b8fef8 100644
--- a/src/linalg.c
+++ b/src/linalg.c
@@ -60,6 +60,30 @@ void process_multi_inner_product_MP( int count, complex_double *results, vector_
PROF_float_STOP( _PIP, (double)(end-start)/(double)l->inner_vector_size, threading );
}
+
+
+void process_multi_inner_product_MP_new( int count, complex_double *results, vector_float *phi,
+ vector_float *psi, level_struct *l, struct Thread *threading ) {
+
+ int start, end;
+ compute_core_start_end(0, psi->size, &start, &end, l, threading);
+ int thread = omp_get_thread_num();
+ if(thread == 0 && start != end)
+ PROF_float_START( _PIP, threading );
+
+ int i, j;
+ for(int c=0; c<count*psi->num_vect; c++)
+ results[c] = 0.0;
+
+ for(int c=0; c<count; c++)
+ for( i=start; i<end; i++ )
+ for( j=0; j<psi->num_vect; j++)
+ results[c*psi->num_vect+j] += (complex_double) conj_float(phi[c].vector_buffer[i*psi->num_vect+j])*psi->vector_buffer[i*psi->num_vect+j];
+
+ if(thread == 0 && start != end)
+ PROF_float_STOP( _PIP, (double)(end-start)/(double)l->inner_vector_size, threading );
+}
#endif
double global_norm_MP( vector_float *x, int start, int end, level_struct *l, struct Thread *threading ) {
@@ -109,3 +133,27 @@ double global_norm_MP( vector_float *x, int start, int end, level_struct *l, str
return sqrt((double)local_alpha);
}
}
+
+void global_norm_MP_new( double *res, vector_float *x, level_struct *l, struct Thread *threading ) {
+
+ int start, end;
+ compute_core_start_end(0, x->size, &start, &end, l, threading);
+ int thread = omp_get_thread_num();
+ if(thread == 0 && start != end)
+ PROF_float_START( _GIP, threading );
+
+ int i, j;
+ for( j=0; j<x->num_vect; j++){
+ res[j]=0;
+ }
+
+ for( i=start; i<end; i++ )
+ for( j=0; j<x->num_vect; j++){
+ res[j] += NORM_SQUARE_float(x->vector_buffer[i*x->num_vect+j]);
+ }
+ for( j=0; j<x->num_vect; j++){
+ res[j] = (double)sqrt((double)res[j]);
+ }
+ if(thread == 0 && start != end)
+ PROF_float_STOP( _GIP, (double)(end-start)/(double)l->inner_vector_size, threading );
+}
diff --git a/src/linalg.h b/src/linalg.h
index 1343ced..80e9514 100644
--- a/src/linalg.h
+++ b/src/linalg.h
@@ -29,11 +29,21 @@ void vector_float_multi_saxpy( vector_float *z, vector_float *V, complex_float *alpha,
+ int sign, int count, int start, int end, level_struct *l );
-
+
+ void vector_double_multi_saxpy_new( vector_double *z, vector_double *V, complex_double *alpha,
+ int sign, int count, level_struct *l, struct Thread *threading );
+
+ void vector_float_multi_saxpy_new( vector_float *z, vector_float *V, complex_float *alpha,
+ int sign, int count, level_struct *l, struct Thread *threading );
+
void process_multi_inner_product_MP( int count, complex_double *results, vector_float *phi, vector_float *psi, int start, int end, level_struct *l, struct Thread *threading );
-
- double global_norm_MP( vector_float *x, int start, int end, level_struct *l, struct Thread *threading );
+ void process_multi_inner_product_MP_new( int count, complex_double *results, vector_float *phi,
+ vector_float *psi, level_struct *l, struct Thread *threading );
+
+ double global_norm_MP( vector_float *x, int start, int end, level_struct *l, struct Thread *threading );
+
+ void global_norm_MP_new( double *res, vector_float *x, level_struct *l, struct Thread *threading );
#endif
diff --git a/src/linalg_generic.c b/src/linalg_generic.c
index 0d79210..ea2c394 100644
--- a/src/linalg_generic.c
+++ b/src/linalg_generic.c
@@ -178,6 +178,7 @@ void process_multi_inner_product_PRECISION_new( int count, complex_PRECISION *re
for(int c=0; c<count; c++)
for( i=start; i<end; i++ )
for( j=0; j<psi->num_vect; j++)
results[c*psi->num_vect+j] += conj_PRECISION(phi[c].vector_buffer[i*psi->num_vect+j])*psi->vector_buffer[i*psi->num_vect+j];
@@ -322,6 +323,25 @@ void vector_PRECISION_plus( vector_PRECISION *z, vector_PRECISION *x, vector_PRE
}
+void vector_PRECISION_plus_new( vector_PRECISION *z, vector_PRECISION *x, vector_PRECISION *y, level_struct *l, struct Thread *threading ) {
+
+ int i, j, start, end;
+ compute_core_start_end(0, x->size, &start, &end, l, threading);
+ int thread = omp_get_thread_num();
+ if(thread == 0 && start != end)
+ PROF_PRECISION_START( _LA2 );
+
+ for( i=start; i<end; i++ )
+ for( j=0; j<x->num_vect; j++){
+ z->vector_buffer[i*x->num_vect+j] = x->vector_buffer[i*x->num_vect+j] + y->vector_buffer[i*x->num_vect+j];
+ }
+
+ if(thread == 0 && start != end)
+ PROF_PRECISION_STOP( _LA2, (double)(end-start)/(double)l->inner_vector_size );
+}
+
+
void vector_PRECISION_minus( vector_PRECISION *z, vector_PRECISION *x, vector_PRECISION *y, int start, int end, level_struct *l ) {
int thread = omp_get_thread_num();
@@ -347,6 +367,7 @@ void vector_PRECISION_minus_new( vector_PRECISION *z, vector_PRECISION *x, vecto
//vector_PRECISION_change_layout( y, y, _LV_SV_NV, no_threading );
//vector_PRECISION_change_layout( z, z, _LV_SV_NV, no_threading );
for( i=start; i<end; i++ )
for( j=0; j<x->num_vect; j++){
z->vector_buffer[i*x->num_vect+j] = x->vector_buffer[i*x->num_vect+j] - y->vector_buffer[i*x->num_vect+j];
}
@@ -382,6 +403,7 @@ void vector_PRECISION_scale_new( vector_PRECISION *z, vector_PRECISION *x, compl
//vector_PRECISION_change_layout( x, x, _LV_SV_NV, no_threading );
//vector_PRECISION_change_layout( z, z, _LV_SV_NV, no_threading );
for( i=start; i<end; i++ )
for( j=0; j<x->num_vect; j++){
z->vector_buffer[i*x->num_vect+j] = alpha[k*x->num_vect+j]*x->vector_buffer[i*x->num_vect+j];
}
@@ -447,10 +469,17 @@ void vector_PRECISION_saxpy_new( vector_PRECISION *z, vector_PRECISION *x, vecto
//vector_PRECISION_change_layout( x, x, _LV_SV_NV, no_threading );
//vector_PRECISION_change_layout( y, y, _LV_SV_NV, no_threading );
//vector_PRECISION_change_layout( z, z, _LV_SV_NV, no_threading );
- for( i=start; i<end; i++ )
- for( j=0; j<x->num_vect; j++){
- z->vector_buffer[i*x->num_vect+j] = x->vector_buffer[i*x->num_vect+j] + 
sign*alpha[k*x->num_vect+j]*y->vector_buffer[i*x->num_vect+j]; - } + if( sign == 1 ) + for( i=start; inum_vect; j++) + z->vector_buffer[i*x->num_vect+j] = x->vector_buffer[i*x->num_vect+j] + alpha[k*x->num_vect+j]*y->vector_buffer[i*x->num_vect+j]; + else + for( i=start; inum_vect; j++) + z->vector_buffer[i*x->num_vect+j] = x->vector_buffer[i*x->num_vect+j] - alpha[k*x->num_vect+j]*y->vector_buffer[i*x->num_vect+j]; + //vector_PRECISION_change_layout( x, x, _NV_LV_SV, no_threading ); //vector_PRECISION_change_layout( y, y, _NV_LV_SV, no_threading ); //vector_PRECISION_change_layout( z, z, _NV_LV_SV, no_threading ); @@ -482,6 +511,33 @@ void vector_PRECISION_multi_saxpy( vector_PRECISION *z, vector_PRECISION *V, com if( thread == 0 && start != end ) PROF_PRECISION_STOP( _LA8, (PRECISION)(count) ); } + +void vector_PRECISION_multi_saxpy_new( vector_PRECISION *z, vector_PRECISION *V, complex_PRECISION *alpha, + int sign, int count, level_struct *l, struct Thread *threading ) { + + int i, j, start, end; + compute_core_start_end(0, z->size, &start, &end, l, threading); + int thread = omp_get_thread_num(); + if (thread == 0 && start != end ) + PROF_PRECISION_START( _LA8 ); + + complex_PRECISION alpha_signed[count*z->num_vect]; + for ( int c=0; cnum_vect; j++) + alpha_signed[c*z->num_vect+j] = sign*alpha[c*z->num_vect+j]; + } + + for ( int c=0; cnum_vect; j++) + z->vector_buffer[i*z->num_vect+j] += V[c].vector_buffer[i*z->num_vect+j]*alpha_signed[c]; + } + } + + if( thread == 0 && start != end ) + PROF_PRECISION_STOP( _LA8, (PRECISION)(count) ); +} #endif void vector_PRECISION_projection( vector_PRECISION *z, vector_PRECISION *v, int k, vector_PRECISION *W, complex_PRECISION *diag, diff --git a/src/linalg_generic.h b/src/linalg_generic.h index 712ad36..9f6f7be 100644 --- a/src/linalg_generic.h +++ b/src/linalg_generic.h @@ -113,6 +113,7 @@ complex_PRECISION local_xy_over_xx_PRECISION( vector_PRECISION *phi, vector_PRECISION *psi, int start, int end, level_struct *l ); void vector_PRECISION_plus( vector_PRECISION *z, vector_PRECISION *x, vector_PRECISION *y, int start, int end, level_struct *l ); // z := x + y + void vector_PRECISION_plus_new( vector_PRECISION *z, vector_PRECISION *x, vector_PRECISION *y, level_struct *l, struct Thread *threading ); void vector_PRECISION_minus( vector_PRECISION *z, vector_PRECISION *x, vector_PRECISION *y, int start, int end, level_struct *l ); // z := x - y void vector_PRECISION_minus_new( vector_PRECISION *z, vector_PRECISION *x, vector_PRECISION *y, level_struct *l, struct Thread *threading ); void vector_PRECISION_scale( vector_PRECISION *z, vector_PRECISION *x, complex_PRECISION alpha, int start, int end, level_struct *l ); // z := alpha*x diff --git a/src/linsolve.c b/src/linsolve.c index 3b08843..4465450 100644 --- a/src/linsolve.c +++ b/src/linsolve.c @@ -31,7 +31,7 @@ void fgmres_MP_struct_init( gmres_MP_struct *p ) { void fgmres_MP_struct_alloc( int m, int n, const int vl_type, double tol, const int prec_kind, void (*precond)(), gmres_MP_struct *p, level_struct *l ) { long int total=0; - int i, k=0, n_vl=1; + int i, k=0, n_vl=g.num_rhs_vect; p->dp.restart_length = m; p->sp.restart_length = m; p->dp.num_restart = n; p->sp.num_restart = n; @@ -39,7 +39,7 @@ void fgmres_MP_struct_alloc( int m, int n, const int vl_type, double tol, const if ( g.method == 6 ) { p->dp.eval_operator = g5D_plus_clover_double; p->sp.eval_operator = g5D_plus_clover_float; } else { - p->dp.eval_operator = d_plus_clover_double; p->sp.eval_operator = d_plus_clover_float; + 
p->dp.eval_operator = d_plus_clover_double_new; p->sp.eval_operator = d_plus_clover_float_new; } p->dp.tol = tol; p->sp.tol = MAX(tol,1E-5); p->dp.kind = _NOTHING; p->sp.kind = prec_kind; @@ -56,18 +56,18 @@ void fgmres_MP_struct_alloc( int m, int n, const int vl_type, double tol, const if ( g.method == 6 ) { g.p.eval_operator = g5D_plus_clover_double; } else { - g.p.eval_operator = d_plus_clover_double; + g.p.eval_operator = d_plus_clover_double_new; } #ifdef HAVE_TM1p1 - n_vl=2; + n_vl*=2; #endif // double precision part total = 0; - total += (m+1)*m; // Hessenberg matrix + total += (m+1)*m*n_vl; // Hessenberg matrix MALLOC( p->dp.H, complex_double*, m ); - total += 4*(m+1); // y, gamma, c, s + total += 4*(m+1)*n_vl; // y, gamma, c, s p->dp.total_storage = total; // precomputed storage amount @@ -78,16 +78,16 @@ void fgmres_MP_struct_alloc( int m, int n, const int vl_type, double tol, const total = 0; // H for ( i=1; idp.H[i] = p->dp.H[0] + i*(m+1); - total += m*(m+1); + p->dp.H[i] = p->dp.H[0] + i*(m+1)*n_vl; + total += m*(m+1)*n_vl; // y - p->dp.y = p->dp.H[0] + total; total += m+1; + p->dp.y = p->dp.H[0] + total; total += (m+1)*n_vl; // gamma - p->dp.gamma = p->dp.H[0] + total; total += m+1; + p->dp.gamma = p->dp.H[0] + total; total += (m+1)*n_vl; // c - p->dp.c = p->dp.H[0] + total; total += m+1; + p->dp.c = p->dp.H[0] + total; total += (m+1)*n_vl; // s - p->dp.s = p->dp.H[0] + total; total += m+1; + p->dp.s = p->dp.H[0] + total; total += (m+1)*n_vl; // x vector_double_alloc( &(p->dp.x), vl_type, n_vl, l, no_threading ); // r @@ -163,11 +163,18 @@ int fgmres_MP( gmres_MP_struct *p, level_struct *l, struct Thread *threading ) { int start; int end; - int j=-1, finish=0, iter=0, il, ol; - complex_double gamma0 = 0; - complex_double beta = 0; + int j=-1, finish=0, iter=0, il, ol, n_vect=g.num_rhs_vect, i, n_vec; + complex_double gamma0[n_vect];//gamma0=0; + double beta[n_vect]; //beta=0; - double norm_r0=1, gamma_jp1=1, t0=0, t1=0; + double t0=0, t1=0; + double norm_r0[n_vect], gamma_jp1[n_vect], gamma0_real[n_vect], gamma_tot, H_tot, gamma_tot2;//norm_r0=1, gamma_jp1=1 + complex_float gamma_float[n_vect]; + + for( i=0; idepth==0 && ( p->dp.timing || p->dp.print ) ) prof_init( l ); @@ -182,97 +189,134 @@ int fgmres_MP( gmres_MP_struct *p, level_struct *l, struct Thread *threading ) { SYNC_MASTER_TO_ALL(threading) // compute start and end indices for core // this puts zero for all other hyperthreads, so we can call functions below with all hyperthreads - compute_core_start_end(p->dp.v_start, p->dp.v_end, &start, &end, l, threading); + //compute_core_start_end(p->dp.v_start, p->dp.v_end, &start, &end, l, threading); // Outer loop in double precision for( ol=0; oldp.num_restart && finish==0; ol++ ) { - + if( ol == 0 && p->dp.initial_guess_zero ) { - vector_double_copy( &(p->dp.r), &(p->dp.b), start, end, l ); + //vector_double_copy( &(p->dp.r), &(p->dp.b), start, end, l ); + vector_double_copy_new( &(p->dp.r), &(p->dp.b), l, threading ); } else { apply_operator_double( &(p->dp.r), &(p->dp.x), &(p->dp), l, threading ); // compute r <- D*x - vector_double_minus( &(p->dp.r), &(p->dp.b), &(p->dp.r), start, end, l ); // compute r <- b - r + //vector_double_minus( &(p->dp.r), &(p->dp.b), &(p->dp.r), start, end, l ); // compute r <- b - r + vector_double_minus_new( &(p->dp.r), &(p->dp.b), &(p->dp.r), l, threading ); } - gamma0 = (complex_double) global_norm_double( &(p->dp.r), p->dp.v_start, p->dp.v_end, l, threading ); // gamma_0 = norm(r) + //gamma0 = (complex_double) global_norm_double( 
&(p->dp.r), p->dp.v_start, p->dp.v_end, l, threading ); // gamma_0 = norm(r) + global_norm_double_new( gamma0_real, &(p->dp.r), l, threading ); + for( i=0; idp.gamma[0] = gamma0; + //p->dp.gamma[0] = gamma0; + #pragma vector aligned + for( i=0; idp.gamma[i] = gamma0[i]; END_MASTER(threading) SYNC_MASTER_TO_ALL(threading) if( ol == 0) { if (l->depth == 0 && !p->dp.initial_guess_zero) { - norm_r0 = global_norm_double( &(p->dp.b), start, end, l, threading ); - printf0("| initial guess relative residual: %le |\n", creal(gamma0)/norm_r0); + //norm_r0 = global_norm_double( &(p->dp.b), start, end, l, threading ); + global_norm_double_new( norm_r0, &(p->dp.b), l, threading ); + for( i=0; idp.print && g.print > 0 ) { START_MASTER(threading) printf0("+----------------------------------------------------------+\n"); - printf0("| restarting ... true residual norm: %6e |\n", creal(gamma0)/norm_r0 ); + for( i=0; isp.V[0]), &(p->dp.r), l->s_float.op.translation_table, l, threading ); - vector_float_real_scale( &(p->sp.V[0]), &(p->sp.V[0]), (float)(1/p->dp.gamma[0]), start, end, l ); // V[0] <- r / gamma_0 - + trans_float_new( &(p->sp.V[0]), &(p->dp.r), l->s_float.op.translation_table, l, threading ); + //vector_float_real_scale( &(p->sp.V[0]), &(p->sp.V[0]), (float)(1/p->dp.gamma[0]), start, end, l ); // V[0] <- r / gamma_0 + for( i=0; idp.gamma[0*n_vect+i]; + vector_float_real_scale_new( &(p->sp.V[0]), &(p->sp.V[0]), gamma_float, 0, 1, l, threading ); // inner loop in single precision for( il=0; ildp.restart_length && finish==0; il++) { j = il; iter++; - arnoldi_step_MP( p->sp.V, p->sp.Z, &(p->sp.w), p->dp.H, p->dp.y, j, p->sp.preconditioner, &(p->sp), l, threading ); - - if ( cabs( p->dp.H[j][j+1] ) > 1E-15 ) { + arnoldi_step_MP_new( p->sp.V, p->sp.Z, &(p->sp.w), p->dp.H, p->dp.y, j, p->sp.preconditioner, &(p->sp), l, threading ); + H_tot=0; + for( i=0; idp.H[j][(j+1)*n_vect+i] ); + //if ( cabs( p->dp.H[j][j+1] ) > 1E-15 ) + if ( H_tot > n_vect*1E-15 ) { qr_update_double( p->dp.H, p->dp.s, p->dp.c, p->dp.gamma, j, l, threading ); - gamma_jp1 = cabs( p->dp.gamma[j+1] ); - + //gamma_jp1 = cabs( p->dp.gamma[j+1] ); + #pragma vector aligned + for( i=0; idp.gamma[(j+1)*n_vect+i] ); + if ( iter%10 == 0 || p->sp.preconditioner != NULL || l->depth > 0 ) { #if defined(TRACK_RES) && !defined(WILSON_BENCHMARK) START_MASTER(threading) if ( p->sp.print && g.print > 0 ) - printf0("| approx. rel. res. after %-6d iterations: %e |\n", iter, gamma_jp1/norm_r0 ); + for( i=0; idp.tol || gamma_jp1/norm_r0 > 1E+5 ) { // if satisfied ... stop + gamma_tot=0; + for( i=0; idp.tol || gamma_jp1/norm_r0 > 1E+5 ) // if satisfied ... 
stop + if( gamma_tot < n_vect*p->dp.tol || gamma_tot > n_vect*1E+5 ) { finish = 1; START_MASTER(threading) - if ( gamma_jp1/norm_r0 > 1E+5 ) printf0("Divergence of fgmres_MP, iter = %d, level=%d\n", iter, l->level ); + if ( gamma_tot > n_vect*1E+5 ) printf0("Divergence of fgmres_MP, iter = %d, level=%d\n", iter, l->level ); END_MASTER(threading) } - if( gamma_jp1/creal(gamma0) < p->sp.tol ) + gamma_tot2=0; + for( i=0; isp.tol ) + if( gamma_tot2 < n_vect*p->sp.tol ){ break; + } } else { finish = 1; } } // end of a single restart - compute_solution_MP( &(p->sp.w), (p->sp.preconditioner&&p->sp.kind==_RIGHT)?p->sp.Z:p->sp.V, + compute_solution_MP_new( &(p->sp.w), (p->sp.preconditioner&&p->sp.kind==_RIGHT)?p->sp.Z:p->sp.V, p->dp.y, p->dp.gamma, p->dp.H, j, &(p->sp), l, threading ); - - trans_back_float( &(p->dp.r), &(p->sp.w), l->s_float.op.translation_table, l, threading ); + + trans_back_float_new( &(p->dp.r), &(p->sp.w), l->s_float.op.translation_table, l, threading ); if ( ol == 0 ) { - vector_double_copy( &(p->dp.x), &(p->dp.r), start, end, l ); + //vector_double_copy( &(p->dp.x), &(p->dp.r), start, end, l ); + vector_double_copy_new(&(p->dp.x), &(p->dp.r), l, threading); } else { - vector_double_plus( &(p->dp.x), &(p->dp.x), &(p->dp.r), start, end, l ); + //vector_double_plus( &(p->dp.x), &(p->dp.x), &(p->dp.r), start, end, l ); + vector_double_plus_new( &(p->dp.x), &(p->dp.x), &(p->dp.r), l, threading ); } } // end of fgmres START_LOCKED_MASTER(threading) - if ( l->depth == 0 ) { t1 = MPI_Wtime(); g.total_time = t1-t0; g.iter_count = iter; g.norm_res = gamma_jp1/norm_r0; } + if ( l->depth == 0 ) { t1 = MPI_Wtime(); g.total_time = t1-t0; g.iter_count = iter; g.norm_res = gamma_tot; } END_LOCKED_MASTER(threading) if ( p->dp.print ) { #ifdef FGMRES_RESTEST apply_operator_double( &(p->dp.r), &(p->dp.x), &(p->dp), l, threading ); - vector_double_minus( &(p->dp.r), &(p->dp.b), &(p->dp.r), start, end, l ); - beta = global_norm_double( &(p->dp.r), p->dp.v_start, p->dp.v_end, l, threading ); + //vector_double_minus( &(p->dp.r), &(p->dp.b), &(p->dp.r), start, end, l ); + vector_double_minus_new( &(p->dp.r), &(p->dp.b), &(p->dp.r), l, threading ); + //beta = global_norm_double( &(p->dp.r), p->dp.v_start, p->dp.v_end, l, threading ); + global_norm_double_new( beta, &(p->dp.r), l, threading ); #else - beta = gamma_jp1; + for( i=0; i 0 ) printf0("| coarse grid time: %-8.4lf seconds (%04.1lf%%) |\n", @@ -384,6 +429,88 @@ void arnoldi_step_MP( vector_float *V, vector_float *Z, vector_float *w, } +void arnoldi_step_MP_new( vector_float *V, vector_float *Z, vector_float *w, + complex_double **H, complex_double* buffer, int j, void (*prec)(), + gmres_float_struct *p, level_struct *l, struct Thread *threading ) { + + SYNC_MASTER_TO_ALL(threading) + SYNC_CORES(threading) + int i, n_vect=g.num_rhs_vect, n_vec; + double H_tot; + complex_float H_float[n_vect]; + // start and end indices for vector functions depending on thread + int start; + int end; + // compute start and end indices for core + // this puts zero for all other hyperthreads, so we can call functions below with all hyperthreads + //compute_core_start_end(p->v_start, p->v_end, &start, &end, l, threading); + + if ( prec != NULL ) { + if ( p->kind == _LEFT ) { + apply_operator_float( &Z[0], &V[j], p, l, threading ); + prec( w, NULL, &Z[0], _NO_RES, l, threading ); + } else { + if ( g.mixed_precision == 2 && (g.method >= 1 && g.method <= 2 ) ) { + prec( &Z[j], w, &V[j], _NO_RES, l, threading ); + // obtains w = D * Z[j] from Schwarz + } else { + 
prec( &Z[j], NULL, &V[j], _NO_RES, l, threading ); + apply_operator_float( w, &Z[j], p, l, threading ); // w = D*Z[j] + } + } + } else { + apply_operator_float( w, &V[j], p, l, threading ); // w = D*V[j] + } + + complex_double tmp[(j+1)*n_vect]; + process_multi_inner_product_MP_new( j+1, tmp, V, w, l, threading ); + START_MASTER(threading) + for( i=0; i<=j; i++ ) + #pragma vector aligned + for( n_vec=0; n_vec 1 ) { + PROF_double_START( _ALLR ); + MPI_Allreduce( buffer, H[j], (j+1)*n_vect, MPI_COMPLEX_double, MPI_SUM, (l->depth==0)?g.comm_cart:l->gs_float.level_comm ); + PROF_double_STOP( _ALLR, 1 ); + } else { + for( i=0; i<=j; i++ ) + #pragma vector aligned + for( n_vec=0; n_vec n_vect*1e-15 ){ + for( n_vec=0; n_vecv_start, p->v_end, &start, &end, l, threading); + + START_MASTER(threading) + + PROF_double_START( _SMALL2 ); + + // backward substitution + for ( i=j; i>=0; i-- ) { + for ( n=0; nv_start+p->r.size*n_vec, p->v_end+p->r.size*n_vec, &start, &end, l, threading); SYNC_CORES(threading) @@ -289,6 +287,7 @@ int fgmres_PRECISION( gmres_PRECISION_struct *p, level_struct *l, struct Thread START_MASTER(threading) //p->gamma[0] = gamma0; + #pragma vector aligned for( i=0; igamma[i] = gamma0[i]; END_MASTER(threading); @@ -349,9 +348,10 @@ int fgmres_PRECISION( gmres_PRECISION_struct *p, level_struct *l, struct Thread if ( H_tot > n_vect*p->tol/10 ) { qr_update_PRECISION( p->H, p->s, p->c, p->gamma, j, l, threading ); //gamma_jp1 = cabs( p->gamma[(j+1)] ); + #pragma vector aligned for( i=0; igamma[(j+1)*n_vect+i] ); - + #if defined(TRACK_RES) && !defined(WILSON_BENCHMARK) if ( iter%10 == 0 || p->preconditioner != NULL || l->depth > 0 ) { START_MASTER(threading) @@ -1135,6 +1135,7 @@ int arnoldi_step_PRECISION_new( vector_PRECISION *V, vector_PRECISION *Z, vector process_multi_inner_product_PRECISION_new( j+1, tmp, V, w, l, threading ); START_MASTER(threading) for( i=0; i<=j; i++ ) + #pragma vector aligned for( n_vec=0; n_vec 1 ) { @@ -1143,6 +1144,7 @@ int arnoldi_step_PRECISION_new( vector_PRECISION *V, vector_PRECISION *Z, vector PROF_PRECISION_STOP( _ALLR, 1 ); } else { for( i=0; i<=j; i++ ) + #pragma vector aligned for( n_vec=0; n_vecinner_vector_size; - double diff; + int ivs = l->inner_vector_size, n_vect=g.num_rhs_vect; + double diff, diff1[n_vect], diff2[n_vect]; vector_double vd[4]; vector_PRECISION vp[2]; for(int i=0; i<4; i++){ vector_double_init( &vd[i] ); - vector_double_alloc( &vd[i], _INNER, 1, l, threading ); + vector_double_alloc( &vd[i], _INNER, n_vect, l, threading ); } for(int i=0; i<2; i++){ vector_PRECISION_init( &vp[i] ); - vector_PRECISION_alloc( &vp[i], _INNER, 1, l, threading ); + vector_PRECISION_alloc( &vp[i], _INNER, n_vect, l, threading ); } START_LOCKED_MASTER(threading) - vector_double_define_random( &vd[0], 0, l->inner_vector_size, l ); - //apply_operator_double( &vd[1], &vd[0], &(g.p), l, no_threading ); + //vector_double_define_random( &vd[0], 0, l->inner_vector_size, l ); + vector_double_define_random_new( &vd[0], l, no_threading ); + apply_operator_double( &vd[1], &vd[0], &(g.p), l, no_threading ); - trans_PRECISION( &vp[0], &vd[0], op->translation_table, l, no_threading ); - //apply_operator_PRECISION( &vp[1], &vp[0], &(l->p_PRECISION), l, no_threading ); - //trans_back_PRECISION( &vd[2], &vp[1], op->translation_table, l, no_threading ); + trans_PRECISION_new( &vp[0], &vd[0], op->translation_table, l, no_threading ); + apply_operator_PRECISION( &vp[1], &vp[0], &(l->p_PRECISION), l, no_threading ); + trans_back_PRECISION_new( &vd[2], &vp[1], 
op->translation_table, l, no_threading ); //vector_double_minus( &vd[3], &vd[2], &vd[1], 0, l->inner_vector_size, l ); + vector_double_minus_new( &vd[3], &vd[2], &vd[1], l, no_threading ); //diff = global_norm_double( &vd[3], 0, ivs, l, no_threading )/ // global_norm_double( &vd[2], 0, ivs, l, no_threading ); - vector_double_minus( &vd[3], &vd[0], &vd[0], 0, l->inner_vector_size, l ); - diff = global_norm_double( &vd[3], 0, ivs, l, no_threading )/ - global_norm_double( &vd[0], 0, ivs, l, no_threading ); - - test0_PRECISION("depth: %d, correctness of schwarz PRECISION Dirac operator: %le\n", l->depth, diff ); + global_norm_double_new( diff1, &vd[3], l, no_threading ); + global_norm_double_new( diff2, &vd[2], l, no_threading ); + + test0_PRECISION("depth: %d, correctness of schwarz PRECISION Dirac operator: %le\n", l->depth, diff1[n_vect-1]/diff2[n_vect-1] ); END_LOCKED_MASTER(threading) if(threading->n_core > 1) { diff --git a/src/schwarz_generic.c b/src/schwarz_generic.c index b513f88..fa70c36 100644 --- a/src/schwarz_generic.c +++ b/src/schwarz_generic.c @@ -2208,6 +2208,79 @@ void trans_back_PRECISION( vector_double *out, vector_PRECISION *in, int *tt, le } +void trans_PRECISION_new( vector_PRECISION *out, vector_double *in, int *tt, level_struct *l, struct Thread *threading ) { + + int i, j, k, index; + buffer_PRECISION out_pt = out->vector_buffer; buffer_double in_pt = in->vector_buffer; + int start = threading->start_site[l->depth]; + int end = threading->end_site[l->depth]; + //compute_core_start_end(0, in->size, &start, &end, l, threading); + + // this function seems to do some data reordering, barriers ensure that everything is in sync + SYNC_CORES(threading) + START_NO_HYPERTHREADS(threading) +#ifdef HAVE_TM1p1 + if( g.n_flavours == 2 ) + for ( i=start; ivector_buffer + 24*index; + in_pt = in->vector_buffer + 24*i; + FOR24( *out_pt = (complex_PRECISION) *in_pt; out_pt++; in_pt++; ) + } + else +#endif + for ( i=start; ivector_buffer + 12*index*in->num_vect; + in_pt = in->vector_buffer + 12*i*in->num_vect; + for( k=0; k<12; k++) + for( j=0; jnum_vect; j++){ + *out_pt = (complex_PRECISION) *in_pt; + out_pt++; + in_pt++; + } + } + END_NO_HYPERTHREADS(threading) + SYNC_CORES(threading) +} + + +void trans_back_PRECISION_new( vector_double *out, vector_PRECISION *in, int *tt, level_struct *l, struct Thread *threading ) { + + int i, j, k, index; + buffer_double out_pt = out->vector_buffer; buffer_PRECISION in_pt = in->vector_buffer; + int start = threading->start_site[l->depth]; + int end = threading->end_site[l->depth]; + + // this function seems to do some data reordering, barriers ensure that everything is in sync + SYNC_CORES(threading) + START_NO_HYPERTHREADS(threading) +#ifdef HAVE_TM1p1 + if( g.n_flavours == 2 ) + for ( i=start; ivector_buffer + 24*index; + out_pt = out->vector_buffer + 24*i; + FOR24( *out_pt = (complex_double) *in_pt; out_pt++; in_pt++; ) + } + else +#endif + for ( i=start; ivector_buffer + 12*index*in->num_vect; + out_pt = out->vector_buffer + 12*i*in->num_vect; + for( k=0; k<12; k++) + for( j=0; jnum_vect; j++){ + *out_pt = (complex_double) *in_pt; + out_pt++; + in_pt++; + } + } + END_NO_HYPERTHREADS(threading) + SYNC_CORES(threading) +} + + void schwarz_PRECISION_def( schwarz_PRECISION_struct *s, operator_double_struct *op, level_struct *l ) { schwarz_PRECISION_alloc( s, l ); diff --git a/src/schwarz_generic.h b/src/schwarz_generic.h index 1fb734a..26333ad 100644 --- a/src/schwarz_generic.h +++ b/src/schwarz_generic.h @@ -57,6 +57,8 @@ struct Thread; void 
trans_PRECISION( vector_PRECISION *out, vector_double *in, int *tt, level_struct *l, struct Thread *threading ); void trans_back_PRECISION( vector_double *out, vector_PRECISION *in, int *tt, level_struct *l, struct Thread *threading ); + void trans_PRECISION_new( vector_PRECISION *out, vector_double *in, int *tt, level_struct *l, struct Thread *threading ); + void trans_back_PRECISION_new( vector_double *out, vector_PRECISION *in, int *tt, level_struct *l, struct Thread *threading ); void schwarz_PRECISION_mvm_testfun( schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ); diff --git a/src/vector_generic.c b/src/vector_generic.c index 5f383ac..a61631f 100644 --- a/src/vector_generic.c +++ b/src/vector_generic.c @@ -165,7 +165,7 @@ void vector_PRECISION_real_scale( vector_PRECISION *z, vector_PRECISION *x, comp void vector_PRECISION_real_scale_new( vector_PRECISION *z, vector_PRECISION *x, complex_PRECISION *alpha, int n, int opt, level_struct *l, struct Thread *threading ) { - vector_PRECISION_check_comp( z, x ); + //vector_PRECISION_check_comp( z, x ); int i, j, start, end; PRECISION r_alpha[x->num_vect]; From 2ed347ea3dd39f5c013c085e8dae198bd8bc3702 Mon Sep 17 00:00:00 2001 From: Marc Illa Date: Thu, 23 Aug 2018 14:05:54 +0300 Subject: [PATCH 25/31] Working MP vectorized method 0 --- src/linalg.c | 5 +++-- src/linalg_generic.c | 40 ++++++---------------------------------- src/linsolve.c | 23 +++++++++++++++++------ src/linsolve_generic.c | 13 ++++++------- src/operator_generic.c | 3 ++- 5 files changed, 34 insertions(+), 50 deletions(-) diff --git a/src/linalg.c b/src/linalg.c index 8b8fef8..e95a937 100644 --- a/src/linalg.c +++ b/src/linalg.c @@ -72,8 +72,9 @@ void process_multi_inner_product_MP_new( int count, complex_double *results, vec PROF_float_START( _PIP, threading ); int i, j; - for(int c=0; cnum_vect; c++) - results[c] = 0.0; + for(int c=0; cnum_vect; j++) + results[c*psi->num_vect+j] = 0.0; for(int c=0; cnum_vect; c++) results[c] = 0.0; - //vector_PRECISION_change_layout( psi, psi, _LV_SV_NV, no_threading ); - //for(int c=0; cnum_vect; j++) results[c*psi->num_vect+j] += conj_PRECISION(phi[c].vector_buffer[i*psi->num_vect+j])*psi->vector_buffer[i*psi->num_vect+j]; - //vector_PRECISION_change_layout( psi, psi, _NV_LV_SV, no_threading ); - //for(int c=0; cinner_vector_size, threading ); } @@ -195,8 +183,6 @@ void process_multi_inner_product_PRECISION_new( int count, complex_PRECISION *re complex_PRECISION local_xy_over_xx_PRECISION( vector_PRECISION *phi, vector_PRECISION *psi, int start, int end, level_struct *l ) { - //vector_PRECISION_check_comp( phi, psi ); - complex_PRECISION numerator = 0.0; PRECISION denominator = 0.0; VECTOR_FOR( int i=start, ivector_buffer[i])*psi->vector_buffer[i]; denominator += NORM_SQUARE_PRECISION(phi->vector_buffer[i]), i++, l ); @@ -290,23 +276,25 @@ void global_norm_PRECISION_new( PRECISION *res, vector_PRECISION *x, level_struc int start, end; compute_core_start_end(0, x->size, &start, &end, l, threading); int thread = omp_get_thread_num(); + if(thread == 0 && start != end) + PROF_PRECISION_START( _GIP, threading ); int i, j; for( j=0; jnum_vect; j++){ res[j]=0; } - //START_MASTER(threading) - //vector_PRECISION_change_layout( x, x, _LV_SV_NV, no_threading ); for( i=start; inum_vect; j++){ res[j] += NORM_SQUARE_PRECISION(x->vector_buffer[i*x->num_vect+j]); } - //vector_PRECISION_change_layout( x, x, _NV_LV_SV, no_threading ); + for( j=0; jnum_vect; j++){ res[j] = (PRECISION)sqrt((double)res[j]); } - //END_MASTER(threading) + 
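/* Editorial note (not part of the patch): global_norm_PRECISION_new fills res[] with
 * one norm per right-hand side instead of returning a single scalar as the old
 * global_norm_PRECISION did. A hedged usage sketch, assuming g.num_rhs_vect
 * interleaved right-hand sides as elsewhere in this series: */
{
  PRECISION r_norm[g.num_rhs_vect];
  global_norm_PRECISION_new( r_norm, &(p->r), l, threading );
  for ( int j=0; j<g.num_rhs_vect; j++ )
    printf0("rhs %d: residual norm %le\n", j, (double)r_norm[j]);
}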
+ if(thread == 0 && start != end) + PROF_PRECISION_STOP( _GIP, (double)(end-start)/(double)l->inner_vector_size, threading ); } @@ -363,17 +351,11 @@ void vector_PRECISION_minus_new( vector_PRECISION *z, vector_PRECISION *x, vecto if(thread == 0 && start != end) PROF_PRECISION_START( _LA2 ); - //vector_PRECISION_change_layout( x, x, _LV_SV_NV, no_threading ); - //vector_PRECISION_change_layout( y, y, _LV_SV_NV, no_threading ); - //vector_PRECISION_change_layout( z, z, _LV_SV_NV, no_threading ); for( i=start; inum_vect; j++){ z->vector_buffer[i*x->num_vect+j] = x->vector_buffer[i*x->num_vect+j] - y->vector_buffer[i*x->num_vect+j]; } - //vector_PRECISION_change_layout( x, x, _NV_LV_SV, no_threading ); - //vector_PRECISION_change_layout( y, y, _NV_LV_SV, no_threading ); - //vector_PRECISION_change_layout( z, z, _NV_LV_SV, no_threading ); if(thread == 0 && start != end) PROF_PRECISION_STOP( _LA2, (double)(end-start)/(double)l->inner_vector_size ); @@ -400,15 +382,11 @@ void vector_PRECISION_scale_new( vector_PRECISION *z, vector_PRECISION *x, compl if(thread == 0 && start != end) PROF_PRECISION_START( _LA6 ); - //vector_PRECISION_change_layout( x, x, _LV_SV_NV, no_threading ); - //vector_PRECISION_change_layout( z, z, _LV_SV_NV, no_threading ); for( i=start; inum_vect; j++){ z->vector_buffer[i*x->num_vect+j] = alpha[k*x->num_vect+j]*x->vector_buffer[i*x->num_vect+j]; } - //vector_PRECISION_change_layout( x, x, _NV_LV_SV, no_threading ); - //vector_PRECISION_change_layout( z, z, _NV_LV_SV, no_threading ); if(thread == 0 && start != end) PROF_PRECISION_STOP( _LA6, (double)(end-start)/(double)l->inner_vector_size ); @@ -466,9 +444,6 @@ void vector_PRECISION_saxpy_new( vector_PRECISION *z, vector_PRECISION *x, vecto if (thread == 0 && start != end ) PROF_PRECISION_START( _LA8 ); - //vector_PRECISION_change_layout( x, x, _LV_SV_NV, no_threading ); - //vector_PRECISION_change_layout( y, y, _LV_SV_NV, no_threading ); - //vector_PRECISION_change_layout( z, z, _LV_SV_NV, no_threading ); if( sign == 1 ) for( i=start; inum_vect; j++) z->vector_buffer[i*x->num_vect+j] = x->vector_buffer[i*x->num_vect+j] - alpha[k*x->num_vect+j]*y->vector_buffer[i*x->num_vect+j]; - //vector_PRECISION_change_layout( x, x, _NV_LV_SV, no_threading ); - //vector_PRECISION_change_layout( y, y, _NV_LV_SV, no_threading ); - //vector_PRECISION_change_layout( z, z, _NV_LV_SV, no_threading ); if( thread == 0 && start != end ) PROF_PRECISION_STOP( _LA8, (double)(end-start)/(double)l->inner_vector_size ); diff --git a/src/linsolve.c b/src/linsolve.c index 4465450..ff36e79 100644 --- a/src/linsolve.c +++ b/src/linsolve.c @@ -227,7 +227,7 @@ int fgmres_MP( gmres_MP_struct *p, level_struct *l, struct Thread *threading ) { norm_r0[i]= creal(gamma0[i]); } } -#if defined(TRACK_RES) && !defined(WILSON_BENCHMARK) +/*#if defined(TRACK_RES) && !defined(WILSON_BENCHMARK) else { if ( p->dp.print && g.print > 0 ) { START_MASTER(threading) @@ -238,7 +238,7 @@ int fgmres_MP( gmres_MP_struct *p, level_struct *l, struct Thread *threading ) { END_MASTER(threading) } } -#endif +#endif*/ trans_float_new( &(p->sp.V[0]), &(p->dp.r), l->s_float.op.translation_table, l, threading ); //vector_float_real_scale( &(p->sp.V[0]), &(p->sp.V[0]), (float)(1/p->dp.gamma[0]), start, end, l ); // V[0] <- r / gamma_0 for( i=0; i n_vect*1e-15 ){ for( n_vec=0; n_vecv_start+p->r.size*n_vec, p->v_end+p->r.size*n_vec, &start, &end, l, threading); SYNC_CORES(threading) - for( ol=0; olnum_restart && finish==0; ol++ ) { + for( ol=0; olnum_restart && finish==0; ol++ ) { if( ol == 
0 && p->initial_guess_zero ) { res = _NO_RES; //vector_PRECISION_copy( &(p->r), &(p->b), start, end, l ); @@ -343,8 +342,7 @@ int fgmres_PRECISION( gmres_PRECISION_struct *p, level_struct *l, struct Thread H_tot=0; for( i=0; iH[j][(j+1)*n_vect+i] ); - - //if ( cabs( p->H[j][j+1] ) > p->tol/10 ) { + //if ( cabs( p->H[j][j+1] ) > p->tol/10 ) if ( H_tot > n_vect*p->tol/10 ) { qr_update_PRECISION( p->H, p->s, p->c, p->gamma, j, l, threading ); //gamma_jp1 = cabs( p->gamma[(j+1)] ); @@ -365,7 +363,7 @@ int fgmres_PRECISION( gmres_PRECISION_struct *p, level_struct *l, struct Thread for( i=0; itol || gamma_jp1/norm_r0 > 1E+5 ) { // if satisfied ... stop + //if( gamma_jp1/norm_r0 < p->tol || gamma_jp1/norm_r0 > 1E+5 ) // if satisfied ... stop if( gamma_tot < n_vect*p->tol || gamma_tot > n_vect*1E+5 ) { finish = 1; START_MASTER(threading) @@ -458,8 +456,7 @@ int fgmres_PRECISION( gmres_PRECISION_struct *p, level_struct *l, struct Thread START_MASTER(threading) if ( g.method != 6 ) prof_print( l ); END_MASTER(threading) - } - //} + } return iter; } @@ -1152,6 +1149,7 @@ int arnoldi_step_PRECISION_new( vector_PRECISION *V, vector_PRECISION *Z, vector SYNC_MASTER_TO_ALL(threading) for( i=0; i<=j; i++ ) vector_PRECISION_saxpy_new( w, w, &V[i], H[j], i, -1, l, threading ); + #ifdef REORTH // re-orthogonalization process_multi_inner_product_PRECISION_new( j+1, tmp, V, w, l, threading ); @@ -1182,6 +1180,7 @@ int arnoldi_step_PRECISION_new( vector_PRECISION *V, vector_PRECISION *Z, vector #pragma vector aligned for( n_vec=0; n_vecdepth, diff1[n_vect-1]/diff2[n_vect-1] ); + for(int i=0; idepth, diff1[i]/diff2[i] ); END_LOCKED_MASTER(threading) if(threading->n_core > 1) { From bf4d293ac0e49128adee7a45a5cd67602165983e Mon Sep 17 00:00:00 2001 From: Marc Illa Date: Sun, 25 Nov 2018 19:27:42 +0200 Subject: [PATCH 26/31] Added unroll pragma --- src/dirac_generic.c | 39 ++- src/dirac_generic.h | 585 ++++++++++++++++++++----------------- src/init_generic.c | 24 +- src/linalg.c | 48 +-- src/linalg_generic.c | 124 ++++---- src/linsolve.c | 154 ++++++---- src/linsolve_generic.c | 196 ++++++++----- src/main.c | 2 +- src/main.h | 3 + src/main_pre_def_generic.h | 2 +- src/top_level.c | 11 +- src/vector_generic.c | 61 +++- 12 files changed, 753 insertions(+), 496 deletions(-) diff --git a/src/dirac_generic.c b/src/dirac_generic.c index 0e5f1e1..67f66a0 100644 --- a/src/dirac_generic.c +++ b/src/dirac_generic.c @@ -228,12 +228,11 @@ void clover_PRECISION_new( vector_PRECISION *eta, vector_PRECISION *phi, operato while ( leta < leta_end ) for( i=0; i<12; i++ ){ for( j=0; jglobal_lattice, sl[4]; + double phase[4]; + complex_double twisted_bc; + for (i=0; i<4; i++) + sl[i] = l->local_lattice[i]*g.my_coords[i]; + + for (t=0; tlocal_lattice[0]; t++) { + phase[T] = theta[T]*((double)sl[T]+t)/(double)gl[T]; + for (z=0; zlocal_lattice[1]; z++) { + phase[Z] = phase[T] + theta[Z]*((double)sl[Z]+z)/(double)gl[Z]; + for (y=0; ylocal_lattice[2]; y++) { + phase[Y] = phase[Z] + theta[Y]*((double)sl[Y]+y)/(double)gl[Y]; + for (x=0; xlocal_lattice[3]; x++) { + phase[X] = phase[Y] + theta[X]*((double)sl[X]+x)/(double)gl[X]; + twisted_bc = exp(I*phase[X]); +/*#ifdef HAVE_TM1p1 + if( g.n_flavours == 2 ) { + FOR24( *eta->vector_buffer = (*phi->vector_buffer)*twisted_bc; phi->vector_buffer++; eta->vector_buffer++; ); + } else +#endif*/ + for (i=0; i<12; i++){ + for(j=0; jvector_buffer = (*phi->vector_buffer)*twisted_bc; + phi->vector_buffer++; + eta->vector_buffer++; + } + } + } + } + } + } +} + void operator_updates_PRECISION( level_struct 
*l, struct Thread *threading ) { if ( l->level > 0 ) { diff --git a/src/dirac_generic.h b/src/dirac_generic.h index a522cb8..3b7aa15 100644 --- a/src/dirac_generic.h +++ b/src/dirac_generic.h @@ -39,7 +39,10 @@ void diagonal_aggregate_PRECISION( vector_PRECISION *eta1, vector_PRECISION *eta2, vector_PRECISION *phi, config_PRECISION diag, level_struct *l ); void d_plus_clover_aggregate_PRECISION( vector_PRECISION *eta1, vector_PRECISION *eta2, vector_PRECISION *phi, schwarz_PRECISION_struct *s, level_struct *l ); void d_neighbor_aggregate_PRECISION( vector_PRECISION *eta1, vector_PRECISION *eta2, vector_PRECISION *phi, const int mu, schwarz_PRECISION_struct *s, level_struct *l ); + void apply_twisted_bc_to_vector_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, double *theta, level_struct *l); + void apply_twisted_bc_to_vector_PRECISION_new( vector_PRECISION *eta, vector_PRECISION *phi, double *theta, level_struct *l); + void operator_updates_PRECISION( level_struct *l, struct Thread *threading ); void m0_update_PRECISION( PRECISION m0,operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); void tm_term_PRECISION_setup( PRECISION mu, PRECISION even, PRECISION odd, operator_PRECISION_struct *op, @@ -111,17 +114,19 @@ static inline void mvm_PRECISION_new( const buffer_PRECISION eta, const complex_PRECISION *D, const buffer_PRECISION phi ) { int n_vect = g.num_rhs_vect; - for( int i=0; ivbuf_PRECISION[i]), _ORDINARY, 2, l, no_threading ); + vector_PRECISION_alloc( &(l->vbuf_PRECISION[i]), _ORDINARY, 2*g.num_rhs_vect, l, no_threading ); } - vector_PRECISION_alloc( &(l->p_PRECISION.b), _INNER, 2, l, no_threading ); - vector_PRECISION_alloc( &(l->p_PRECISION.x), _INNER, 2, l, no_threading ); + vector_PRECISION_alloc( &(l->p_PRECISION.b), _INNER, 2*g.num_rhs_vect, l, no_threading ); + vector_PRECISION_alloc( &(l->p_PRECISION.x), _INNER, 2*g.num_rhs_vect, l, no_threading ); #else for ( int i=0; ivbuf_PRECISION[i]), _ORDINARY, 1, l, no_threading ); + vector_PRECISION_alloc( &(l->vbuf_PRECISION[i]), _ORDINARY, g.num_rhs_vect, l, no_threading ); } - vector_PRECISION_alloc( &(l->p_PRECISION.b), _INNER, 1, l, no_threading ); - vector_PRECISION_alloc( &(l->p_PRECISION.x), _INNER, 1, l, no_threading ); + vector_PRECISION_alloc( &(l->p_PRECISION.b), _INNER, g.num_rhs_vect, l, no_threading ); + vector_PRECISION_alloc( &(l->p_PRECISION.x), _INNER, g.num_rhs_vect, l, no_threading ); #endif } @@ -151,11 +151,11 @@ void next_level_PRECISION_setup( level_struct *l ) { vector_PRECISION_init(&(l->next_level->p_PRECISION.b)); vector_PRECISION_init(&(l->next_level->p_PRECISION.x)); #ifdef HAVE_TM1p1 - vector_PRECISION_alloc( &(l->next_level->p_PRECISION.b), _ORDINARY, 2, l->next_level, no_threading ); - vector_PRECISION_alloc( &(l->next_level->p_PRECISION.x), _ORDINARY, 2, l->next_level, no_threading ); + vector_PRECISION_alloc( &(l->next_level->p_PRECISION.b), _ORDINARY, 2*g.num_rhs_vect, l->next_level, no_threading ); + vector_PRECISION_alloc( &(l->next_level->p_PRECISION.x), _ORDINARY, 2*g.num_rhs_vect, l->next_level, no_threading ); #else - vector_PRECISION_alloc( &(l->next_level->p_PRECISION.b), _ORDINARY, 1, l->next_level, no_threading ); - vector_PRECISION_alloc( &(l->next_level->p_PRECISION.x), _ORDINARY, 1, l->next_level, no_threading ); + vector_PRECISION_alloc( &(l->next_level->p_PRECISION.b), _ORDINARY, g.num_rhs_vect, l->next_level, no_threading ); + vector_PRECISION_alloc( &(l->next_level->p_PRECISION.x), _ORDINARY, g.num_rhs_vect, l->next_level, no_threading ); #endif 
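/* Editorial note (not part of the patch): the sizing rule used by all of these
 * allocations, written out once. It mirrors the n_vl logic of fgmres_MP_struct_alloc
 * earlier in this series; the vector v is a placeholder for whichever multi-rhs
 * vector is being allocated: */
int n_vl = g.num_rhs_vect;   /* one interleaved vector per right-hand side */
#ifdef HAVE_TM1p1
n_vl *= 2;                   /* two flavours per rhs for the doublet operator */
#endif
vector_PRECISION_alloc( &v, _ORDINARY, n_vl, l, no_threading );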
l->next_level->p_PRECISION.v_start = 0; l->next_level->p_PRECISION.v_end = l->next_level->inner_vector_size; @@ -165,9 +165,9 @@ void next_level_PRECISION_setup( level_struct *l ) { int i, n = (l->next_level->level>0)?6:4; for ( i=0; inext_level->vbuf_PRECISION[i]), _ORDINARY, 2, l->next_level, no_threading ); + vector_PRECISION_alloc( &(l->next_level->vbuf_PRECISION[i]), _ORDINARY, 2*g.num_rhs_vect, l->next_level, no_threading ); #else - vector_PRECISION_alloc( &(l->next_level->vbuf_PRECISION[i]), _ORDINARY, 1, l->next_level, no_threading ); + vector_PRECISION_alloc( &(l->next_level->vbuf_PRECISION[i]), _ORDINARY, g.num_rhs_vect, l->next_level, no_threading ); #endif } } diff --git a/src/linalg.c b/src/linalg.c index e95a937..402060f 100644 --- a/src/linalg.c +++ b/src/linalg.c @@ -71,16 +71,22 @@ void process_multi_inner_product_MP_new( int count, complex_double *results, vec if(thread == 0 && start != end) PROF_float_START( _PIP, threading ); - int i, j; + int i, j, k; for(int c=0; cnum_vect; j++) - results[c*psi->num_vect+j] = 0.0; + for( j=0; jnum_vect; j+=num_loop) + #pragma unroll + #pragma vector aligned + #pragma ivdep + for( k=0; knum_vect+j+k] = 0.0; for(int c=0; cnum_vect; j++) - results[c*psi->num_vect+j] += (complex_double) conj_float(phi[c].vector_buffer[i*psi->num_vect+j])*psi->vector_buffer[i*psi->num_vect+j]; + for( j=0; jnum_vect; j+=num_loop) + #pragma unroll + #pragma vector aligned + for( k=0; knum_vect+j+k] += (complex_double) conj_float(phi[c].vector_buffer[i*psi->num_vect+j+k])*psi->vector_buffer[i*psi->num_vect+j+k]; if(thread == 0 && start != end) PROF_float_STOP( _PIP, (double)(end-start)/(double)l->inner_vector_size, threading ); @@ -143,18 +149,26 @@ void global_norm_MP_new( double *res, vector_float *x, level_struct *l, struct T if(thread == 0 && start != end) PROF_float_START( _GIP, threading ); - int i, j; - for( j=0; jnum_vect; j++){ - res[j]=0; - } + int i, j, k; + for( j=0; jnum_vect; j+=num_loop ) + #pragma unroll + #pragma vector aligned + for( k=0; knum_vect; j++){ - res[j] += NORM_SQUARE_float(x->vector_buffer[i*x->num_vect+j]); - } - for( j=0; jnum_vect; j++){ - res[j] = (double)sqrt((double)res[j]); - } + for( i=start; inum_vect; j+=num_loop ) + #pragma unroll + #pragma vector aligned + for( k=0; kvector_buffer[i*x->num_vect+j+k]); + + for( j=0; jnum_vect; j+=num_loop ) + #pragma unroll + #pragma vector aligned + for( k=0; kinner_vector_size, threading ); } diff --git a/src/linalg_generic.c b/src/linalg_generic.c index 22cd143..3ef28c2 100644 --- a/src/linalg_generic.c +++ b/src/linalg_generic.c @@ -164,15 +164,20 @@ void process_multi_inner_product_PRECISION_new( int count, complex_PRECISION *re if(thread == 0 && start != end) PROF_PRECISION_START( _PIP, threading ); - int i, j; - for(int c=0; cnum_vect; c++) - results[c] = 0.0; + int i, j, k; + for(int c=0; cnum_vect; c+=num_loop) + #pragma unroll + #pragma vector aligned + for( k=0; knum_vect; j++) - results[c*psi->num_vect+j] += conj_PRECISION(phi[c].vector_buffer[i*psi->num_vect+j])*psi->vector_buffer[i*psi->num_vect+j]; + for( j=0; jnum_vect; j+=num_loop) + #pragma unroll + #pragma vector aligned + for( k=0; knum_vect+j+k] += conj_PRECISION(phi[c].vector_buffer[i*psi->num_vect+j+k])*psi->vector_buffer[i*psi->num_vect+j+k]; if(thread == 0 && start != end) PROF_PRECISION_STOP( _PIP, (double)(end-start)/(double)l->inner_vector_size, threading ); @@ -279,19 +284,23 @@ void global_norm_PRECISION_new( PRECISION *res, vector_PRECISION *x, level_struc if(thread == 0 && start != end) 
PROF_PRECISION_START( _GIP, threading ); - int i, j; - for( j=0; jnum_vect; j++){ - res[j]=0; - } + int i, j, k; + for( j=0; jnum_vect; j+=num_loop) + #pragma unroll + #pragma vector aligned + for( k=0; knum_vect; j++){ - res[j] += NORM_SQUARE_PRECISION(x->vector_buffer[i*x->num_vect+j]); - } + for( j=0; jnum_vect; j+=num_loop) + #pragma unroll + for( k=0; kvector_buffer[i*x->num_vect+j+k]); - for( j=0; jnum_vect; j++){ - res[j] = (PRECISION)sqrt((double)res[j]); - } + for( j=0; jnum_vect; j+=num_loop) + #pragma unroll + for( k=0; kinner_vector_size, threading ); @@ -313,17 +322,19 @@ void vector_PRECISION_plus( vector_PRECISION *z, vector_PRECISION *x, vector_PRE void vector_PRECISION_plus_new( vector_PRECISION *z, vector_PRECISION *x, vector_PRECISION *y, level_struct *l, struct Thread *threading ) { - int i, j, start, end; + int i, j, k, start, end; compute_core_start_end(0, x->size, &start, &end, l, threading); int thread = omp_get_thread_num(); if(thread == 0 && start != end) PROF_PRECISION_START( _LA2 ); for( i=start; inum_vect; j++){ - z->vector_buffer[i*x->num_vect+j] = x->vector_buffer[i*x->num_vect+j] + y->vector_buffer[i*x->num_vect+j]; - } + for( j=0; jnum_vect; j+=num_loop) + #pragma unroll + #pragma vector aligned + #pragma ivdep + for( k=0; kvector_buffer[i*x->num_vect+j+k] = x->vector_buffer[i*x->num_vect+j+k] + y->vector_buffer[i*x->num_vect+j+k]; if(thread == 0 && start != end) PROF_PRECISION_STOP( _LA2, (double)(end-start)/(double)l->inner_vector_size ); @@ -345,17 +356,19 @@ void vector_PRECISION_minus( vector_PRECISION *z, vector_PRECISION *x, vector_PR void vector_PRECISION_minus_new( vector_PRECISION *z, vector_PRECISION *x, vector_PRECISION *y, level_struct *l, struct Thread *threading ) { - int i, j, start, end; + int i, j, k, start, end; compute_core_start_end(0, y->size, &start, &end, l, threading); int thread = omp_get_thread_num(); if(thread == 0 && start != end) PROF_PRECISION_START( _LA2 ); for( i=start; inum_vect; j++){ - z->vector_buffer[i*x->num_vect+j] = x->vector_buffer[i*x->num_vect+j] - y->vector_buffer[i*x->num_vect+j]; - } + for( j=0; jnum_vect; j+=num_loop) + #pragma unroll + #pragma vector aligned + #pragma ivdep + for( k=0; kvector_buffer[i*x->num_vect+j+k] = x->vector_buffer[i*x->num_vect+j+k] - y->vector_buffer[i*x->num_vect+j+k]; if(thread == 0 && start != end) PROF_PRECISION_STOP( _LA2, (double)(end-start)/(double)l->inner_vector_size ); @@ -376,17 +389,19 @@ void vector_PRECISION_scale( vector_PRECISION *z, vector_PRECISION *x, complex_P void vector_PRECISION_scale_new( vector_PRECISION *z, vector_PRECISION *x, complex_PRECISION *alpha, int k, level_struct *l, struct Thread *threading ) { - int i, j, start, end; + int i, j, n, start, end; compute_core_start_end(0, x->size, &start, &end, l, threading); int thread = omp_get_thread_num(); if(thread == 0 && start != end) PROF_PRECISION_START( _LA6 ); for( i=start; inum_vect; j++){ - z->vector_buffer[i*x->num_vect+j] = alpha[k*x->num_vect+j]*x->vector_buffer[i*x->num_vect+j]; - } + for( j=0; jnum_vect; j+=num_loop) + #pragma unroll + #pragma vector aligned + #pragma ivdep + for( n=0; nvector_buffer[i*x->num_vect+j+n] = alpha[k*x->num_vect+j+n]*x->vector_buffer[i*x->num_vect+j+n]; if(thread == 0 && start != end) PROF_PRECISION_STOP( _LA6, (double)(end-start)/(double)l->inner_vector_size ); @@ -438,7 +453,7 @@ void vector_PRECISION_saxpy( vector_PRECISION *z, vector_PRECISION *x, vector_PR void vector_PRECISION_saxpy_new( vector_PRECISION *z, vector_PRECISION *x, vector_PRECISION *y, 
complex_PRECISION *alpha, int k, int sign, level_struct *l, struct Thread *threading ) { - int i, j, start, end; + int i, j, n, start, end; compute_core_start_end(0, x->size, &start, &end, l, threading); int thread = omp_get_thread_num(); if (thread == 0 && start != end ) @@ -446,14 +461,20 @@ void vector_PRECISION_saxpy_new( vector_PRECISION *z, vector_PRECISION *x, vecto if( sign == 1 ) for( i=start; inum_vect; j++) - z->vector_buffer[i*x->num_vect+j] = x->vector_buffer[i*x->num_vect+j] + alpha[k*x->num_vect+j]*y->vector_buffer[i*x->num_vect+j]; + for( j=0; jnum_vect; j+=num_loop) + #pragma unroll + #pragma vector aligned + #pragma ivdep + for( n=0; nvector_buffer[i*x->num_vect+j+n] = x->vector_buffer[i*x->num_vect+j+n] + alpha[k*x->num_vect+j+n]*y->vector_buffer[i*x->num_vect+j+n]; else for( i=start; inum_vect; j++) - z->vector_buffer[i*x->num_vect+j] = x->vector_buffer[i*x->num_vect+j] - alpha[k*x->num_vect+j]*y->vector_buffer[i*x->num_vect+j]; + for( j=0; jnum_vect; j+=num_loop) + #pragma unroll + #pragma vector aligned + #pragma ivdep + for( n=0; nvector_buffer[i*x->num_vect+j+n] = x->vector_buffer[i*x->num_vect+j+n] - alpha[k*x->num_vect+j+n]*y->vector_buffer[i*x->num_vect+j+n]; if( thread == 0 && start != end ) @@ -487,25 +508,28 @@ void vector_PRECISION_multi_saxpy( vector_PRECISION *z, vector_PRECISION *V, com void vector_PRECISION_multi_saxpy_new( vector_PRECISION *z, vector_PRECISION *V, complex_PRECISION *alpha, int sign, int count, level_struct *l, struct Thread *threading ) { - int i, j, start, end; + int c, i, j, k, start, end; compute_core_start_end(0, z->size, &start, &end, l, threading); int thread = omp_get_thread_num(); if (thread == 0 && start != end ) PROF_PRECISION_START( _LA8 ); complex_PRECISION alpha_signed[count*z->num_vect]; - for ( int c=0; cnum_vect; j++) - alpha_signed[c*z->num_vect+j] = sign*alpha[c*z->num_vect+j]; - } - - for ( int c=0; cnum_vect; j+=num_loop) + #pragma unroll #pragma vector aligned - for( j=0; jnum_vect; j++) - z->vector_buffer[i*z->num_vect+j] += V[c].vector_buffer[i*z->num_vect+j]*alpha_signed[c]; - } - } + for( k=0; knum_vect+j+k] = sign*alpha[c*z->num_vect+j+k]; + + for ( c=0; cnum_vect; j+=num_loop) + #pragma unroll + #pragma vector aligned + #pragma ivdep + for( k=0; kvector_buffer[i*z->num_vect+j+k] += V[c].vector_buffer[i*z->num_vect+j+k]*alpha_signed[c]; if( thread == 0 && start != end ) PROF_PRECISION_STOP( _LA8, (PRECISION)(count) ); diff --git a/src/linsolve.c b/src/linsolve.c index ff36e79..2d91e66 100644 --- a/src/linsolve.c +++ b/src/linsolve.c @@ -163,7 +163,7 @@ int fgmres_MP( gmres_MP_struct *p, level_struct *l, struct Thread *threading ) { int start; int end; - int j=-1, finish=0, iter=0, il, ol, n_vect=g.num_rhs_vect, i, n_vec; + int j=-1, finish=0, iter=0, il, ol, n_vect=g.num_rhs_vect, i, k;//n_vec; complex_double gamma0[n_vect];//gamma0=0; double beta[n_vect]; //beta=0; @@ -171,9 +171,16 @@ int fgmres_MP( gmres_MP_struct *p, level_struct *l, struct Thread *threading ) { double norm_r0[n_vect], gamma_jp1[n_vect], gamma0_real[n_vect], gamma_tot, H_tot, gamma_tot2;//norm_r0=1, gamma_jp1=1 complex_float gamma_float[n_vect]; - for( i=0; idp.r), p->dp.v_start, p->dp.v_end, l, threading ); // gamma_0 = norm(r) global_norm_double_new( gamma0_real, &(p->dp.r), l, threading ); - for( i=0; idp.gamma[0] = gamma0; - #pragma vector aligned - for( i=0; idp.gamma[i] = gamma0[i]; + for( i=0; idp.gamma[i+k] = gamma0[i+k]; END_MASTER(threading) SYNC_MASTER_TO_ALL(threading) @@ -223,8 +234,10 @@ int fgmres_MP( gmres_MP_struct *p, 
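The axpy-style kernels above carry one alpha per right-hand side, indexed by the Krylov row, and hoist the sign branch out of the loop nest so the compiler sees a constant inside the hot loops. A sketch with hypothetical names, same interleaving assumptions as before:

#include <complex.h>

#define NUM_LOOP 4  /* hypothetical blocking factor */

/* z = x +/- alpha.*y with one alpha per right-hand side, taken from row `row` */
void saxpy_multi_rhs( float complex *z, const float complex *x, const float complex *y,
                      const float complex *alpha, int row, int sign,
                      int start, int end, int num_vect ) {
  if ( sign == 1 ) {
    for ( int i=start; i<end; i++ )
      for ( int j=0; j<num_vect; j+=NUM_LOOP )
        for ( int n=0; n<NUM_LOOP; n++ )
          z[i*num_vect+j+n] = x[i*num_vect+j+n] + alpha[row*num_vect+j+n]*y[i*num_vect+j+n];
  } else {
    for ( int i=start; i<end; i++ )
      for ( int j=0; j<num_vect; j+=NUM_LOOP )
        for ( int n=0; n<NUM_LOOP; n++ )
          z[i*num_vect+j+n] = x[i*num_vect+j+n] - alpha[row*num_vect+j+n]*y[i*num_vect+j+n];
  }
}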
level_struct *l, struct Thread *threading ) { printf0("| initial guess relative residual (%d): %le |\n", i, creal(gamma0[i])/norm_r0[i]); } else { //norm_r0 = creal(gamma0); - for( i=0; isp.V[0]), &(p->dp.r), l->s_float.op.translation_table, l, threading ); //vector_float_real_scale( &(p->sp.V[0]), &(p->sp.V[0]), (float)(1/p->dp.gamma[0]), start, end, l ); // V[0] <- r / gamma_0 - for( i=0; idp.gamma[0*n_vect+i]; + for( i=0; idp.gamma[0*n_vect+i+k]; vector_float_real_scale_new( &(p->sp.V[0]), &(p->sp.V[0]), gamma_float, 0, 1, l, threading ); // inner loop in single precision for( il=0; ildp.restart_length && finish==0; il++) { j = il; iter++; arnoldi_step_MP_new( p->sp.V, p->sp.Z, &(p->sp.w), p->dp.H, p->dp.y, j, p->sp.preconditioner, &(p->sp), l, threading ); H_tot=0; - for( i=0; idp.H[j][(j+1)*n_vect+i] ); + for( i=0; idp.H[j][(j+1)*n_vect+i+k] ); //if ( cabs( p->dp.H[j][j+1] ) > 1E-15 ) if ( H_tot > n_vect*1E-15 ) { qr_update_double( p->dp.H, p->dp.s, p->dp.c, p->dp.gamma, j, l, threading ); //gamma_jp1 = cabs( p->dp.gamma[j+1] ); - #pragma vector aligned - for( i=0; idp.gamma[(j+1)*n_vect+i] ); + for( i=0; idp.gamma[(j+1)*n_vect+i+k] ); if ( iter%10 == 0 || p->sp.preconditioner != NULL || l->depth > 0 ) { #if defined(TRACK_RES) && !defined(WILSON_BENCHMARK) @@ -269,8 +290,10 @@ int fgmres_MP( gmres_MP_struct *p, level_struct *l, struct Thread *threading ) { #endif } gamma_tot=0; - for( i=0; idp.tol || gamma_jp1/norm_r0 > 1E+5 ) // if satisfied ... stop if( gamma_tot < n_vect*p->dp.tol || gamma_tot > n_vect*1E+5 ) { @@ -280,8 +303,10 @@ int fgmres_MP( gmres_MP_struct *p, level_struct *l, struct Thread *threading ) { END_MASTER(threading) } gamma_tot2=0; - for( i=0; isp.tol ) if( gamma_tot2 < n_vect*p->sp.tol ){ break; @@ -315,8 +340,10 @@ int fgmres_MP( gmres_MP_struct *p, level_struct *l, struct Thread *threading ) { //beta = global_norm_double( &(p->dp.r), p->dp.v_start, p->dp.v_end, l, threading ); global_norm_double_new( beta, &(p->dp.r), l, threading ); #else - for( i=0; i 0 ) printf0("| coarse grid time: %-8.4lf seconds (%04.1lf%%) |\n", @@ -435,7 +462,7 @@ void arnoldi_step_MP_new( vector_float *V, vector_float *Z, vector_float *w, SYNC_MASTER_TO_ALL(threading) SYNC_CORES(threading) - int i, n_vect=g.num_rhs_vect, n_vec; + int i, n_vect=g.num_rhs_vect, n, k; double H_tot; complex_float H_float[n_vect]; // start and end indices for vector functions depending on thread @@ -466,9 +493,11 @@ void arnoldi_step_MP_new( vector_float *V, vector_float *Z, vector_float *w, process_multi_inner_product_MP_new( j+1, tmp, V, w, l, threading ); START_MASTER(threading) for( i=0; i<=j; i++ ) - #pragma vector aligned - for( n_vec=0; n_vec 1 ) { PROF_double_START( _ALLR ); @@ -476,17 +505,22 @@ void arnoldi_step_MP_new( vector_float *V, vector_float *Z, vector_float *w, PROF_double_STOP( _ALLR, 1 ); } else { for( i=0; i<=j; i++ ) - #pragma vector aligned - for( n_vec=0; n_vec n_vect*1e-15 ){ - for( n_vec=0; n_vec=0; i-- ) { - for ( n=0; ndepth==0 && ( p->timing || p->print ) ) prof_init( l ); @@ -281,14 +283,19 @@ int fgmres_PRECISION( gmres_PRECISION_struct *p, level_struct *l, struct Thread } //gamma0 = global_norm_PRECISION( &(p->r), p->v_start, p->v_end, l, threading ); // gamma_0 = norm(r) global_norm_PRECISION_new( gamma0_real, &(p->r), l, threading ); - for( i=0; igamma[0] = gamma0; - #pragma vector aligned - for( i=0; igamma[i] = gamma0[i]; + for( i=0; igamma[i+k] = gamma0[i+k]; + END_MASTER(threading); SYNC_MASTER_TO_ALL(threading); @@ -300,8 +307,10 @@ int fgmres_PRECISION( 
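fgmres_MP collapses the per-RHS residual estimates into one scalar, gamma_tot, and compares it against n_vect times the tolerance, so convergence is declared for the whole block of right-hand sides at once rather than per system. The test reduces to (hypothetical standalone form):

#include <complex.h>

/* gamma[(j+1)*n_vect+i] holds the current residual estimate of right-hand side i */
int all_rhs_converged( const double complex *gamma, int j, int n_vect, double tol ) {
  double gamma_tot = 0.0;
  for ( int i=0; i<n_vect; i++ )
    gamma_tot += cabs( gamma[(j+1)*n_vect+i] );
  /* stop on ensemble convergence, or bail out if the residuals blow up */
  return ( gamma_tot < n_vect*tol || gamma_tot > n_vect*1e+5 );
}

A per-RHS test would let individually converged systems be frozen early; summing keeps a single shared Krylov iteration count across all right-hand sides.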
gmres_PRECISION_struct *p, level_struct *l, struct Thread printf0("| initial guess relative residual (%d): %le |\n", i, creal(gamma0[i])/norm_r0[i]); } else { //norm_r0 = creal(p->gamma[0]); - for( i=0; igamma[i]); + for( i=0; igamma[i+k]); } } //vector_PRECISION_real_scale( &(p->V[0]), &(p->r), 1/p->gamma[0], start, end, l ); // v_0 = r / gamma_0 @@ -340,15 +349,21 @@ int fgmres_PRECISION( gmres_PRECISION_struct *p, level_struct *l, struct Thread } #endif H_tot=0; - for( i=0; iH[j][(j+1)*n_vect+i] ); + for( i=0; iH[j][(j+1)*n_vect+i+k] ); + //if ( cabs( p->H[j][j+1] ) > p->tol/10 ) if ( H_tot > n_vect*p->tol/10 ) { qr_update_PRECISION( p->H, p->s, p->c, p->gamma, j, l, threading ); //gamma_jp1 = cabs( p->gamma[(j+1)] ); - #pragma vector aligned - for( i=0; igamma[(j+1)*n_vect+i] ); + for( i=0; igamma[(j+1)*n_vect+i+k] ); #if defined(TRACK_RES) && !defined(WILSON_BENCHMARK) if ( iter%10 == 0 || p->preconditioner != NULL || l->depth > 0 ) { @@ -360,8 +375,10 @@ int fgmres_PRECISION( gmres_PRECISION_struct *p, level_struct *l, struct Thread } #endif gamma_tot=0; - for( i=0; itol || gamma_jp1/norm_r0 > 1E+5 ) // if satisfied ... stop if( gamma_tot < n_vect*p->tol || gamma_tot > n_vect*1E+5 ) { @@ -395,24 +412,26 @@ int fgmres_PRECISION( gmres_PRECISION_struct *p, level_struct *l, struct Thread //beta = global_norm_PRECISION( &(p->r), p->v_start+p->r.size*n_vec, p->v_end+p->r.size*n_vec, l, threading ); global_norm_PRECISION_new( beta, &(p->r), l, threading ); #else - for( i=0; i 0 ) printf0("+----------------------------------------------------------+\n\n"); #endif printf0("+----------------------------------------------------------+\n"); printf0("| FGMRES iterations: %-6d coarse average: %-6.2lf |\n", iter, ((double)g.coarse_iter_count)/((double)iter) ); - for( i=0; i 0 ) printf0("| coarse grid time: %-8.4lf seconds (%04.1lf%%) |\n", @@ -1098,7 +1117,7 @@ int arnoldi_step_PRECISION_new( vector_PRECISION *V, vector_PRECISION *Z, vector #else SYNC_MASTER_TO_ALL(threading) SYNC_CORES(threading) - int i, n_vect=g.num_rhs_vect, n_vec; + int i, n_vect=g.num_rhs_vect, n, k; PRECISION H_tot; // start and end indices for vector functions depending on thread int start, end; @@ -1132,18 +1151,23 @@ int arnoldi_step_PRECISION_new( vector_PRECISION *V, vector_PRECISION *Z, vector process_multi_inner_product_PRECISION_new( j+1, tmp, V, w, l, threading ); START_MASTER(threading) for( i=0; i<=j; i++ ) - #pragma vector aligned - for( n_vec=0; n_vec 1 ) { PROF_PRECISION_START( _ALLR ); MPI_Allreduce( buffer, H[j], (j+1)*n_vect, MPI_COMPLEX_PRECISION, MPI_SUM, (l->depth==0)?g.comm_cart:l->gs_PRECISION.level_comm ); PROF_PRECISION_STOP( _ALLR, 1 ); } else { for( i=0; i<=j; i++ ) - #pragma vector aligned - for( n_vec=0; n_vec 1 ) { PROF_PRECISION_START( _ALLR ); MPI_Allreduce( buffer, tmp, (j+1)*n_vect, MPI_COMPLEX_PRECISION, MPI_SUM, (l->depth==0)?g.comm_cart:l->gs_PRECISION.level_comm ); @@ -1164,8 +1191,10 @@ int arnoldi_step_PRECISION_new( vector_PRECISION *V, vector_PRECISION *Z, vector } for( i=0; i<=j; i++ ) - for( n_vec=0; n_vecH[j][(j+1)*n_vect+i] ); + for( i=0; iH[j][(j+1)*n_vect+i+k] ); if ( H_tot > n_vect*1e-15 ) vector_PRECISION_real_scale_new( &V[j+1], w, H[j], j+1, 1, l, threading ); #endif @@ -1215,40 +1249,54 @@ void qr_update_PRECISION( complex_PRECISION **H, complex_PRECISION *s, PROF_PRECISION_START( _SMALL1 ); - int i, n, n_vect=g.num_rhs_vect; + int i, n, k, n_vect=g.num_rhs_vect; complex_PRECISION beta[n_vect]; // update QR factorization // apply previous Givens rotation - for ( i=0; i=0; 
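qr_update_PRECISION applies the same Givens-rotation recurrence as the single-RHS code, with every rotation coefficient and Hessenberg entry replicated n_vect times. A sketch of the per-RHS update (hypothetical standalone form; H[j][i*n_vect+k] is Hessenberg entry (i,j) for right-hand side k):

#include <complex.h>
#include <math.h>

void qr_update_multi_rhs( double complex **H, double complex *s, double complex *c,
                          double complex *gamma, int j, int n_vect ) {
  for ( int i=0; i<j; i++ )                        /* apply the stored rotations */
    for ( int k=0; k<n_vect; k++ ) {
      double complex beta = -s[i*n_vect+k]*H[j][i*n_vect+k] + c[i*n_vect+k]*H[j][(i+1)*n_vect+k];
      H[j][i*n_vect+k]     = conj(c[i*n_vect+k])*H[j][i*n_vect+k]
                           + conj(s[i*n_vect+k])*H[j][(i+1)*n_vect+k];
      H[j][(i+1)*n_vect+k] = beta;
    }
  for ( int k=0; k<n_vect; k++ ) {                 /* new rotation zeroing H(j+1,j) */
    double r = hypot( cabs(H[j][j*n_vect+k]), cabs(H[j][(j+1)*n_vect+k]) );
    s[j*n_vect+k] = H[j][(j+1)*n_vect+k]/r;
    c[j*n_vect+k] = H[j][j*n_vect+k]/r;
    gamma[(j+1)*n_vect+k] = -s[j*n_vect+k]*gamma[j*n_vect+k];      /* residual update */
    gamma[j*n_vect+k]     = conj(c[j*n_vect+k])*gamma[j*n_vect+k];
    H[j][j*n_vect+k]     = r;
    H[j][(j+1)*n_vect+k] = 0;
  }
}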
i-- ) { - for ( n=0; nstart_index[l->depth], end = threading->end_index[l->depth]; - vector_double rhs = (g.mixed_precision==2 && g.method >= 0)?g.p_MP.dp.b:g.p.b; vector_double sol = (g.mixed_precision==2 && g.method >= 0)?g.p_MP.dp.x:g.p.x; @@ -98,6 +97,7 @@ int wilson_driver( vector_double *solution, vector_double *source, level_struct } //vector_double_copy( solution, &sol, start, end, l ); vector_double_copy_new( solution, &sol, l, threading ); + #ifdef WILSON_BENCHMARK tmp_t += MPI_Wtime(); if ( tmp_t < t_min ) @@ -124,7 +124,8 @@ void solve( vector_double *solution, vector_double *source, level_struct *l, str vector_double rhs = g.mixed_precision==2?g.p_MP.dp.b:g.p.b; // this would yield different results if we threaded it, so we don't START_LOCKED_MASTER(threading) - vector_double_define_random( &rhs, 0, l->inner_vector_size, l ); + //vector_double_define_random( &rhs, 0, l->inner_vector_size, l ); + vector_double_define_random_new( &rhs, l, threading ); scan_var( &(g.vt), l ); END_LOCKED_MASTER(threading) } else { @@ -160,7 +161,7 @@ void solve_driver( level_struct *l, struct Thread *threading ) { vector_double_change_layout( &source, &source, _LV_SV_NV, no_threading ); if(g.bc==2) - apply_twisted_bc_to_vector_double( &source, &source, g.twisted_bc, l); + apply_twisted_bc_to_vector_double_new( &source, &source, g.twisted_bc, l); global_norm_double_new( norm, &source, l, threading ); for( int i=0; ivector_buffer, r_alpha = creal_PRECISION(alpha); + PRECISION * restrict r_z = (PRECISION*)z->vector_buffer, r_alpha = creal_PRECISION(alpha); #else - PRECISION *r_x = (PRECISION*)x->vector_buffer, r_alpha = creal_PRECISION(alpha); + PRECISION *r_z = (PRECISION*)z->vector_buffer, r_alpha = creal_PRECISION(alpha); #endif int r_start = 2*start, r_end = 2*end; @@ -134,7 +134,7 @@ void vector_PRECISION_real_scale( vector_PRECISION *z, vector_PRECISION *x, comp #pragma vector aligned #endif for(int j=0; jvector_buffer, * restrict r_x = (PRECISION*)x->vector_buffer, r_alpha = creal_PRECISION(alpha); @@ -167,27 +167,50 @@ void vector_PRECISION_real_scale_new( vector_PRECISION *z, vector_PRECISION *x, //vector_PRECISION_check_comp( z, x ); - int i, j, start, end; + int i, j, k, start, end; PRECISION r_alpha[x->num_vect]; if(opt){ - for( j=0; jnum_vect; j++) - r_alpha[j]=1.0/creal_PRECISION(alpha[n*x->num_vect+j]); + for( j=0; jnum_vect; j+=num_loop) + #pragma unroll + for( k=0; knum_vect+j+k]); }else{ - for( j=0; jnum_vect; j++) - r_alpha[j]=creal_PRECISION(alpha[n*x->num_vect+j]); + for( j=0; jnum_vect; j+=num_loop) + #pragma unroll + for( k=0; knum_vect+j+k]); } compute_core_start_end(0, x->size, &start, &end, l, threading); int thread = omp_get_thread_num(); if(thread == 0 && start != end) PROF_PRECISION_START( _RS ); + //PRECISION *restrict r_z = (PRECISION*)z->vector_buffer, *restrict r_x = (PRECISION*)x->vector_buffer; + + //for( i=start; inum_vect; j++) + // z->vector_buffer[i*x->num_vect+j] = r_alpha[j]*x->vector_buffer[i*x->num_vect+j]; + //vector_PRECISION_change_layout( x, x, _LV_SV_NV, no_threading ); //vector_PRECISION_change_layout( z, z, _LV_SV_NV, no_threading ); - for( i=start; inum_vect; j++) - z->vector_buffer[i*x->num_vect+j] = r_alpha[j]*x->vector_buffer[i*x->num_vect+j]; - + if(z == x){ + for( i=start; inum_vect; j+=num_loop) + #pragma unroll + #pragma vector aligned + for( k=0; kvector_buffer[i*x->num_vect+j+k] = r_alpha[j+k]*z->vector_buffer[i*x->num_vect+j+k]; + } else { + // PRECISION * restrict r_z = (PRECISION*)z->vector_buffer, * restrict r_x = 
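compute_solution then back-substitutes the (j+1) x (j+1) upper-triangular system once per right-hand side; sketched below (hypothetical standalone form, same storage convention):

#include <complex.h>

/* Solve H y = gamma for each right-hand side k; H[n][i*n_vect+k] is the
 * entry in row i of Hessenberg column n for RHS k. */
void back_substitute_multi_rhs( double complex *y, double complex * const *H,
                                const double complex *gamma, int j, int n_vect ) {
  for ( int i=j; i>=0; i-- )
    for ( int k=0; k<n_vect; k++ ) {
      y[i*n_vect+k] = gamma[i*n_vect+k];
      for ( int n=i+1; n<=j; n++ )                 /* subtract the known tail */
        y[i*n_vect+k] -= H[n][i*n_vect+k]*y[n*n_vect+k];
      y[i*n_vect+k] /= H[i][i*n_vect+k];           /* divide by the diagonal */
    }
}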
(PRECISION*)x->vector_buffer; + for( i=start; inum_vect; j+=num_loop) + #pragma unroll + #pragma vector aligned + #pragma ivdep + for( k=0; kvector_buffer[i*x->num_vect+j+k] = r_alpha[j+k]*x->vector_buffer[i*x->num_vect+j+k]; + } //vector_PRECISION_change_layout( x, x, _NV_LV_SV, no_threading ); //vector_PRECISION_change_layout( z, z, _NV_LV_SV, no_threading ); @@ -229,7 +252,8 @@ void vector_PRECISION_copy_new( vector_PRECISION *z, vector_PRECISION *x, level_ if(z == x) return; - int i, j, start, end; + int i, j, k, start, end; + //PRECISION * restrict r_z = (PRECISION*)z->vector_buffer, * restrict r_x = (PRECISION*)x->vector_buffer; compute_core_start_end(0, x->size, &start, &end, l, threading); int thread = omp_get_thread_num(); if(thread == 0) @@ -237,9 +261,16 @@ void vector_PRECISION_copy_new( vector_PRECISION *z, vector_PRECISION *x, level_ //vector_PRECISION_change_layout( x, x, _LV_SV_NV, no_threading ); //vector_PRECISION_change_layout( z, z, _LV_SV_NV, no_threading ); + //for( i=start; inum_vect; j++) + // z->vector_buffer[i*x->num_vect+j] = x->vector_buffer[i*x->num_vect+j]; for( i=start; inum_vect; j++) - z->vector_buffer[i*x->num_vect+j] = x->vector_buffer[i*x->num_vect+j]; + for( j=0; jnum_vect; j+=num_loop) + #pragma unroll + #pragma vector aligned + #pragma ivdep + for( k=0; kvector_buffer[i*x->num_vect+j+k] = x->vector_buffer[i*x->num_vect+j+k]; //vector_PRECISION_change_layout( x, x, _NV_LV_SV, no_threading ); //vector_PRECISION_change_layout( z, z, _NV_LV_SV, no_threading ); From efc9f4aebf6b9a822f9ce78d8c2fc52f0836a409 Mon Sep 17 00:00:00 2001 From: Marc Illa Date: Mon, 3 Dec 2018 11:47:02 +0200 Subject: [PATCH 27/31] Fixed vectorization (TODO: remove SSE files) --- src/dirac_generic.c | 54 ++++++++++++++++++++++++++---------------- src/dirac_generic.h | 40 ++++++++++++++++++++++++++++++- src/linsolve.c | 1 + src/linsolve_generic.c | 1 + src/main.h | 2 +- src/vector_generic.c | 2 +- 6 files changed, 76 insertions(+), 24 deletions(-) diff --git a/src/dirac_generic.c b/src/dirac_generic.c index 67f66a0..57f54e1 100644 --- a/src/dirac_generic.c +++ b/src/dirac_generic.c @@ -170,7 +170,7 @@ void clover_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, operator_PR void clover_PRECISION_new( vector_PRECISION *eta, vector_PRECISION *phi, operator_PRECISION_struct *op, int start, int end, level_struct *l, struct Thread *threading ) { - int nv = l->num_lattice_site_var, n_vect=g.num_rhs_vect, i, j; + int nv = l->num_lattice_site_var, n_vect=g.num_rhs_vect, i, j, k; buffer_PRECISION lphi = phi->vector_buffer+start*n_vect, leta = eta->vector_buffer+start*n_vect; buffer_PRECISION leta_end = eta->vector_buffer+end*n_vect; #ifdef PROFILING @@ -214,25 +214,33 @@ void clover_PRECISION_new( vector_PRECISION *eta, vector_PRECISION *phi, operato #ifdef HAVE_TM if ( g.mu + g.mu_odd_shift != 0.0 || g.mu + g.mu_even_shift != 0.0 ) { while ( leta < leta_end ) - for( i=0; i<12; i++ ){ - for( j=0; j=0; i-- ) { for ( n=0; n=0; i-- ) { for ( n=0; nvector_buffer[i*x->num_vect+j+k] = r_alpha[j+k]*z->vector_buffer[i*x->num_vect+j+k]; + z->vector_buffer[i*x->num_vect+j+k] *= r_alpha[j+k]; //*z->vector_buffer[i*x->num_vect+j+k]; } else { // PRECISION * restrict r_z = (PRECISION*)z->vector_buffer, * restrict r_x = (PRECISION*)x->vector_buffer; for( i=start; i Date: Tue, 4 Dec 2018 17:14:01 +0200 Subject: [PATCH 28/31] Deleted all sse and vectorized files and functions --- src/clifford.h | 324 +--------------------- src/coarse_oddeven_generic.c | 495 ---------------------------------- 
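The vectorization fix in vector_PRECISION_real_scale_new above is the aliased case: when z and x are the same vector, the assignment is rewritten as a compound in-place multiply, removing the read-after-write overlap that the aligned/ivdep pragmas would otherwise wrongly declare safe. A sketch of both branches (hypothetical names):

#include <complex.h>

#define NUM_LOOP 4  /* hypothetical blocking factor */

/* z = r_alpha .* x with one real factor per right-hand side */
void real_scale_multi_rhs( float complex *z, const float complex *x,
                           const float *r_alpha, int start, int end, int num_vect ) {
  if ( (const float complex*)z == x ) {
    for ( int i=start; i<end; i++ )
      for ( int j=0; j<num_vect; j+=NUM_LOOP )
        for ( int k=0; k<NUM_LOOP; k++ )
          z[i*num_vect+j+k] *= r_alpha[j+k];                    /* in place */
  } else {
    for ( int i=start; i<end; i++ )
      for ( int j=0; j<num_vect; j+=NUM_LOOP )
        for ( int k=0; k<NUM_LOOP; k++ )
          z[i*num_vect+j+k] = r_alpha[j+k]*x[i*num_vect+j+k];   /* disjoint buffers */
  }
}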
src/coarse_oddeven_generic.h | 6 - src/coarse_operator_generic.c | 178 +----------- src/coarse_operator_generic.h | 3 - src/dirac_generic.c | 130 +-------- src/ghost_generic.c | 31 --- src/ghost_generic.h | 4 - src/init.c | 19 -- src/interpolation_generic.c | 3 - src/linalg.c | 2 - src/linalg_generic.c | 21 -- src/main.h | 35 +-- src/main_pre_def_generic.h | 18 +- src/oddeven_generic.c | 297 +------------------- src/operator_generic.c | 64 ----- src/schwarz_generic.c | 108 -------- src/schwarz_generic.h | 18 -- src/setup_generic.c | 32 +-- src/threading.c | 4 - src/vector_generic.c | 71 +---- 21 files changed, 23 insertions(+), 1840 deletions(-) diff --git a/src/clifford.h b/src/clifford.h index 6521566..9307579 100644 --- a/src/clifford.h +++ b/src/clifford.h @@ -99,64 +99,6 @@ #define GAMMA_X_SPIN2_VAL I #define GAMMA_X_SPIN3_CO 1 #define GAMMA_X_SPIN3_VAL -I -#ifdef SSE - #define GAMMA_T_SPIN0_RE_SIGN -1 - #define GAMMA_T_SPIN1_RE_SIGN -1 - #define GAMMA_T_SPIN2_RE_SIGN -1 - #define GAMMA_T_SPIN3_RE_SIGN -1 - #define GAMMA_T_SPIN0_IM_SIGN -1 - #define GAMMA_T_SPIN1_IM_SIGN -1 - #define GAMMA_T_SPIN2_IM_SIGN -1 - #define GAMMA_T_SPIN3_IM_SIGN -1 - #define GAMMA_T_SPIN0_OFFSET 0 - #define GAMMA_T_SPIN1_OFFSET 0 - #define GAMMA_T_SPIN2_OFFSET 0 - #define GAMMA_T_SPIN3_OFFSET 0 - - #define GAMMA_Z_SPIN0_RE_SIGN +1 - #define GAMMA_Z_SPIN1_RE_SIGN +1 - #define GAMMA_Z_SPIN2_RE_SIGN -1 - #define GAMMA_Z_SPIN3_RE_SIGN -1 - #define GAMMA_Z_SPIN0_IM_SIGN -1 - #define GAMMA_Z_SPIN1_IM_SIGN -1 - #define GAMMA_Z_SPIN2_IM_SIGN +1 - #define GAMMA_Z_SPIN3_IM_SIGN +1 - #define GAMMA_Z_SPIN0_OFFSET 1 - #define GAMMA_Z_SPIN1_OFFSET 1 - #define GAMMA_Z_SPIN2_OFFSET 1 - #define GAMMA_Z_SPIN3_OFFSET 1 - - #define GAMMA_Y_SPIN0_RE_SIGN -1 - #define GAMMA_Y_SPIN1_RE_SIGN +1 - #define GAMMA_Y_SPIN2_RE_SIGN +1 - #define GAMMA_Y_SPIN3_RE_SIGN -1 - #define GAMMA_Y_SPIN0_IM_SIGN -1 - #define GAMMA_Y_SPIN1_IM_SIGN +1 - #define GAMMA_Y_SPIN2_IM_SIGN +1 - #define GAMMA_Y_SPIN3_IM_SIGN -1 - #define GAMMA_Y_SPIN0_OFFSET 0 - #define GAMMA_Y_SPIN1_OFFSET 0 - #define GAMMA_Y_SPIN2_OFFSET 0 - #define GAMMA_Y_SPIN3_OFFSET 0 - - #define GAMMA_X_SPIN0_RE_SIGN +1 - #define GAMMA_X_SPIN1_RE_SIGN -1 - #define GAMMA_X_SPIN2_RE_SIGN -1 - #define GAMMA_X_SPIN3_RE_SIGN +1 - #define GAMMA_X_SPIN0_IM_SIGN -1 - #define GAMMA_X_SPIN1_IM_SIGN +1 - #define GAMMA_X_SPIN2_IM_SIGN +1 - #define GAMMA_X_SPIN3_IM_SIGN -1 - #define GAMMA_X_SPIN0_OFFSET 1 - #define GAMMA_X_SPIN1_OFFSET 1 - #define GAMMA_X_SPIN2_OFFSET 1 - #define GAMMA_X_SPIN3_OFFSET 1 - - #define GAMMA_T_SHUFFLE(A) A - #define GAMMA_Z_SHUFFLE(A) _mm_shuffle_pd(A,A,_MM_SHUFFLE2(0,1)) - #define GAMMA_Y_SHUFFLE(A) A - #define GAMMA_X_SHUFFLE(A) _mm_shuffle_pd(A,A,_MM_SHUFFLE2(0,1)) -#endif /* ------------------------------------------------- */ #else @@ -224,64 +166,6 @@ #define GAMMA_X_SPIN2_VAL I #define GAMMA_X_SPIN3_CO 0 #define GAMMA_X_SPIN3_VAL I -#ifdef SSE - #define GAMMA_T_SPIN0_RE_SIGN -1 - #define GAMMA_T_SPIN1_RE_SIGN -1 - #define GAMMA_T_SPIN2_RE_SIGN -1 - #define GAMMA_T_SPIN3_RE_SIGN -1 - #define GAMMA_T_SPIN0_IM_SIGN -1 - #define GAMMA_T_SPIN1_IM_SIGN -1 - #define GAMMA_T_SPIN2_IM_SIGN -1 - #define GAMMA_T_SPIN3_IM_SIGN -1 - #define GAMMA_T_SPIN0_OFFSET 0 - #define GAMMA_T_SPIN1_OFFSET 0 - #define GAMMA_T_SPIN2_OFFSET 0 - #define GAMMA_T_SPIN3_OFFSET 0 - - #define GAMMA_Z_SPIN0_RE_SIGN +1 - #define GAMMA_Z_SPIN1_RE_SIGN -1 - #define GAMMA_Z_SPIN2_RE_SIGN -1 - #define GAMMA_Z_SPIN3_RE_SIGN +1 - #define GAMMA_Z_SPIN0_IM_SIGN -1 - #define GAMMA_Z_SPIN1_IM_SIGN +1 - 
#define GAMMA_Z_SPIN2_IM_SIGN +1 - #define GAMMA_Z_SPIN3_IM_SIGN -1 - #define GAMMA_Z_SPIN0_OFFSET 1 - #define GAMMA_Z_SPIN1_OFFSET 1 - #define GAMMA_Z_SPIN2_OFFSET 1 - #define GAMMA_Z_SPIN3_OFFSET 1 - - #define GAMMA_Y_SPIN0_RE_SIGN +1 - #define GAMMA_Y_SPIN1_RE_SIGN -1 - #define GAMMA_Y_SPIN2_RE_SIGN -1 - #define GAMMA_Y_SPIN3_RE_SIGN +1 - #define GAMMA_Y_SPIN0_IM_SIGN +1 - #define GAMMA_Y_SPIN1_IM_SIGN -1 - #define GAMMA_Y_SPIN2_IM_SIGN -1 - #define GAMMA_Y_SPIN3_IM_SIGN +1 - #define GAMMA_Y_SPIN0_OFFSET 0 - #define GAMMA_Y_SPIN1_OFFSET 0 - #define GAMMA_Y_SPIN2_OFFSET 0 - #define GAMMA_Y_SPIN3_OFFSET 0 - - #define GAMMA_X_SPIN0_RE_SIGN +1 - #define GAMMA_X_SPIN1_RE_SIGN +1 - #define GAMMA_X_SPIN2_RE_SIGN -1 - #define GAMMA_X_SPIN3_RE_SIGN -1 - #define GAMMA_X_SPIN0_IM_SIGN -1 - #define GAMMA_X_SPIN1_IM_SIGN -1 - #define GAMMA_X_SPIN2_IM_SIGN +1 - #define GAMMA_X_SPIN3_IM_SIGN +1 - #define GAMMA_X_SPIN0_OFFSET 1 - #define GAMMA_X_SPIN1_OFFSET 1 - #define GAMMA_X_SPIN2_OFFSET 1 - #define GAMMA_X_SPIN3_OFFSET 1 - - #define GAMMA_T_SHUFFLE(A) A - #define GAMMA_Z_SHUFFLE(A) _mm_shuffle_pd(A,A,_MM_SHUFFLE2(0,1)) - #define GAMMA_Y_SHUFFLE(A) A - #define GAMMA_X_SHUFFLE(A) _mm_shuffle_pd(A,A,_MM_SHUFFLE2(0,1)) -#endif /* ------------------------------------------------- */ #else #ifdef BASIS2 @@ -346,64 +230,6 @@ #define GAMMA_X_SPIN2_VAL -I #define GAMMA_X_SPIN3_CO 0 #define GAMMA_X_SPIN3_VAL -I -#ifdef SSE - #define GAMMA_T_SPIN0_RE_SIGN +1 - #define GAMMA_T_SPIN1_RE_SIGN +1 - #define GAMMA_T_SPIN2_RE_SIGN +1 - #define GAMMA_T_SPIN3_RE_SIGN +1 - #define GAMMA_T_SPIN0_IM_SIGN +1 - #define GAMMA_T_SPIN1_IM_SIGN +1 - #define GAMMA_T_SPIN2_IM_SIGN +1 - #define GAMMA_T_SPIN3_IM_SIGN +1 - #define GAMMA_T_SPIN0_OFFSET 0 - #define GAMMA_T_SPIN1_OFFSET 0 - #define GAMMA_T_SPIN2_OFFSET 0 - #define GAMMA_T_SPIN3_OFFSET 0 - - #define GAMMA_Z_SPIN0_RE_SIGN -1 - #define GAMMA_Z_SPIN1_RE_SIGN +1 - #define GAMMA_Z_SPIN2_RE_SIGN +1 - #define GAMMA_Z_SPIN3_RE_SIGN -1 - #define GAMMA_Z_SPIN0_IM_SIGN +1 - #define GAMMA_Z_SPIN1_IM_SIGN -1 - #define GAMMA_Z_SPIN2_IM_SIGN -1 - #define GAMMA_Z_SPIN3_IM_SIGN +1 - #define GAMMA_Z_SPIN0_OFFSET 1 - #define GAMMA_Z_SPIN1_OFFSET 1 - #define GAMMA_Z_SPIN2_OFFSET 1 - #define GAMMA_Z_SPIN3_OFFSET 1 - - #define GAMMA_Y_SPIN0_RE_SIGN -1 - #define GAMMA_Y_SPIN1_RE_SIGN +1 - #define GAMMA_Y_SPIN2_RE_SIGN +1 - #define GAMMA_Y_SPIN3_RE_SIGN -1 - #define GAMMA_Y_SPIN0_IM_SIGN -1 - #define GAMMA_Y_SPIN1_IM_SIGN +1 - #define GAMMA_Y_SPIN2_IM_SIGN +1 - #define GAMMA_Y_SPIN3_IM_SIGN -1 - #define GAMMA_Y_SPIN0_OFFSET 0 - #define GAMMA_Y_SPIN1_OFFSET 0 - #define GAMMA_Y_SPIN2_OFFSET 0 - #define GAMMA_Y_SPIN3_OFFSET 0 - - #define GAMMA_X_SPIN0_RE_SIGN -1 - #define GAMMA_X_SPIN1_RE_SIGN -1 - #define GAMMA_X_SPIN2_RE_SIGN +1 - #define GAMMA_X_SPIN3_RE_SIGN +1 - #define GAMMA_X_SPIN0_IM_SIGN +1 - #define GAMMA_X_SPIN1_IM_SIGN +1 - #define GAMMA_X_SPIN2_IM_SIGN -1 - #define GAMMA_X_SPIN3_IM_SIGN -1 - #define GAMMA_X_SPIN0_OFFSET 1 - #define GAMMA_X_SPIN1_OFFSET 1 - #define GAMMA_X_SPIN2_OFFSET 1 - #define GAMMA_X_SPIN3_OFFSET 1 - - #define GAMMA_T_SHUFFLE(A) A - #define GAMMA_Z_SHUFFLE(A) _mm_shuffle_pd(A,A,_MM_SHUFFLE2(0,1)) - #define GAMMA_Y_SHUFFLE(A) A - #define GAMMA_X_SHUFFLE(A) _mm_shuffle_pd(A,A,_MM_SHUFFLE2(0,1)) -#endif #else #ifdef BASIS3 // Basis used in the QOPQDP Code (by James Osborn/USQCD) @@ -467,64 +293,6 @@ #define GAMMA_X_SPIN2_VAL -I #define GAMMA_X_SPIN3_CO 1 #define GAMMA_X_SPIN3_VAL I -#ifdef SSE - #define GAMMA_T_SPIN0_RE_SIGN +1 - #define GAMMA_T_SPIN1_RE_SIGN +1 - 
#define GAMMA_T_SPIN2_RE_SIGN +1 - #define GAMMA_T_SPIN3_RE_SIGN +1 - #define GAMMA_T_SPIN0_IM_SIGN +1 - #define GAMMA_T_SPIN1_IM_SIGN +1 - #define GAMMA_T_SPIN2_IM_SIGN +1 - #define GAMMA_T_SPIN3_IM_SIGN +1 - #define GAMMA_T_SPIN0_OFFSET 0 - #define GAMMA_T_SPIN1_OFFSET 0 - #define GAMMA_T_SPIN2_OFFSET 0 - #define GAMMA_T_SPIN3_OFFSET 0 - - #define GAMMA_Z_SPIN0_RE_SIGN -1 - #define GAMMA_Z_SPIN1_RE_SIGN -1 - #define GAMMA_Z_SPIN2_RE_SIGN +1 - #define GAMMA_Z_SPIN3_RE_SIGN +1 - #define GAMMA_Z_SPIN0_IM_SIGN +1 - #define GAMMA_Z_SPIN1_IM_SIGN +1 - #define GAMMA_Z_SPIN2_IM_SIGN -1 - #define GAMMA_Z_SPIN3_IM_SIGN -1 - #define GAMMA_Z_SPIN0_OFFSET 1 - #define GAMMA_Z_SPIN1_OFFSET 1 - #define GAMMA_Z_SPIN2_OFFSET 1 - #define GAMMA_Z_SPIN3_OFFSET 1 - - #define GAMMA_Y_SPIN0_RE_SIGN -1 - #define GAMMA_Y_SPIN1_RE_SIGN +1 - #define GAMMA_Y_SPIN2_RE_SIGN +1 - #define GAMMA_Y_SPIN3_RE_SIGN -1 - #define GAMMA_Y_SPIN0_IM_SIGN -1 - #define GAMMA_Y_SPIN1_IM_SIGN +1 - #define GAMMA_Y_SPIN2_IM_SIGN +1 - #define GAMMA_Y_SPIN3_IM_SIGN -1 - #define GAMMA_Y_SPIN0_OFFSET 0 - #define GAMMA_Y_SPIN1_OFFSET 0 - #define GAMMA_Y_SPIN2_OFFSET 0 - #define GAMMA_Y_SPIN3_OFFSET 0 - - #define GAMMA_X_SPIN0_RE_SIGN -1 - #define GAMMA_X_SPIN1_RE_SIGN +1 - #define GAMMA_X_SPIN2_RE_SIGN +1 - #define GAMMA_X_SPIN3_RE_SIGN -1 - #define GAMMA_X_SPIN0_IM_SIGN +1 - #define GAMMA_X_SPIN1_IM_SIGN -1 - #define GAMMA_X_SPIN2_IM_SIGN -1 - #define GAMMA_X_SPIN3_IM_SIGN +1 - #define GAMMA_X_SPIN0_OFFSET 1 - #define GAMMA_X_SPIN1_OFFSET 1 - #define GAMMA_X_SPIN2_OFFSET 1 - #define GAMMA_X_SPIN3_OFFSET 1 - - #define GAMMA_T_SHUFFLE(A) A - #define GAMMA_Z_SHUFFLE(A) _mm_shuffle_pd(A,A,_MM_SHUFFLE2(0,1)) - #define GAMMA_Y_SHUFFLE(A) A - #define GAMMA_X_SHUFFLE(A) _mm_shuffle_pd(A,A,_MM_SHUFFLE2(0,1)) -#endif #else #ifdef BASIS4 // tmLQCD BASIS with an addition change of sign in gamma5 @@ -589,100 +357,10 @@ #define GAMMA_X_SPIN2_VAL -I #define GAMMA_X_SPIN3_CO 0 #define GAMMA_X_SPIN3_VAL -I -#ifdef SSE - #define GAMMA_T_SPIN0_RE_SIGN -1 - #define GAMMA_T_SPIN1_RE_SIGN -1 - #define GAMMA_T_SPIN2_RE_SIGN -1 - #define GAMMA_T_SPIN3_RE_SIGN -1 - #define GAMMA_T_SPIN0_IM_SIGN -1 - #define GAMMA_T_SPIN1_IM_SIGN -1 - #define GAMMA_T_SPIN2_IM_SIGN -1 - #define GAMMA_T_SPIN3_IM_SIGN -1 - #define GAMMA_T_SPIN0_OFFSET 0 - #define GAMMA_T_SPIN1_OFFSET 0 - #define GAMMA_T_SPIN2_OFFSET 0 - #define GAMMA_T_SPIN3_OFFSET 0 - - #define GAMMA_Z_SPIN0_RE_SIGN +1 - #define GAMMA_Z_SPIN1_RE_SIGN -1 - #define GAMMA_Z_SPIN2_RE_SIGN -1 - #define GAMMA_Z_SPIN3_RE_SIGN +1 - #define GAMMA_Z_SPIN0_IM_SIGN -1 - #define GAMMA_Z_SPIN1_IM_SIGN +1 - #define GAMMA_Z_SPIN2_IM_SIGN +1 - #define GAMMA_Z_SPIN3_IM_SIGN -1 - #define GAMMA_Z_SPIN0_OFFSET 1 - #define GAMMA_Z_SPIN1_OFFSET 1 - #define GAMMA_Z_SPIN2_OFFSET 1 - #define GAMMA_Z_SPIN3_OFFSET 1 - - #define GAMMA_Y_SPIN0_RE_SIGN -1 - #define GAMMA_Y_SPIN1_RE_SIGN +1 - #define GAMMA_Y_SPIN2_RE_SIGN +1 - #define GAMMA_Y_SPIN3_RE_SIGN -1 - #define GAMMA_Y_SPIN0_IM_SIGN -1 - #define GAMMA_Y_SPIN1_IM_SIGN +1 - #define GAMMA_Y_SPIN2_IM_SIGN +1 - #define GAMMA_Y_SPIN3_IM_SIGN -1 - #define GAMMA_Y_SPIN0_OFFSET 0 - #define GAMMA_Y_SPIN1_OFFSET 0 - #define GAMMA_Y_SPIN2_OFFSET 0 - #define GAMMA_Y_SPIN3_OFFSET 0 - - #define GAMMA_X_SPIN0_RE_SIGN -1 - #define GAMMA_X_SPIN1_RE_SIGN -1 - #define GAMMA_X_SPIN2_RE_SIGN +1 - #define GAMMA_X_SPIN3_RE_SIGN +1 - #define GAMMA_X_SPIN0_IM_SIGN +1 - #define GAMMA_X_SPIN1_IM_SIGN +1 - #define GAMMA_X_SPIN2_IM_SIGN -1 - #define GAMMA_X_SPIN3_IM_SIGN -1 - #define GAMMA_X_SPIN0_OFFSET 1 - #define 
GAMMA_X_SPIN1_OFFSET 1 - #define GAMMA_X_SPIN2_OFFSET 1 - #define GAMMA_X_SPIN3_OFFSET 1 - - #define GAMMA_T_SHUFFLE(A) A - #define GAMMA_Z_SHUFFLE(A) _mm_shuffle_pd(A,A,_MM_SHUFFLE2(0,1)) - #define GAMMA_Y_SHUFFLE(A) A - #define GAMMA_X_SHUFFLE(A) _mm_shuffle_pd(A,A,_MM_SHUFFLE2(0,1)) -#endif /* ------------------------------------------------- */ + /* ------------------------------------------------- */ #endif #endif #endif #endif #endif - -#ifdef SSE -static const int gamma_co[4][4] = { - {GAMMA_T_SPIN0_CO, GAMMA_T_SPIN1_CO, GAMMA_T_SPIN2_CO, GAMMA_T_SPIN3_CO}, - {GAMMA_Z_SPIN0_CO, GAMMA_Z_SPIN1_CO, GAMMA_Z_SPIN2_CO, GAMMA_Z_SPIN3_CO}, - {GAMMA_Y_SPIN0_CO, GAMMA_Y_SPIN1_CO, GAMMA_Y_SPIN2_CO, GAMMA_Y_SPIN3_CO}, - {GAMMA_X_SPIN0_CO, GAMMA_X_SPIN1_CO, GAMMA_X_SPIN2_CO, GAMMA_X_SPIN3_CO}}; - -static const double complex gamma_val[4][4] = { - {GAMMA_T_SPIN0_VAL, GAMMA_T_SPIN1_VAL, GAMMA_T_SPIN2_VAL, GAMMA_T_SPIN3_VAL}, - {GAMMA_Z_SPIN0_VAL, GAMMA_Z_SPIN1_VAL, GAMMA_Z_SPIN2_VAL, GAMMA_Z_SPIN3_VAL}, - {GAMMA_Y_SPIN0_VAL, GAMMA_Y_SPIN1_VAL, GAMMA_Y_SPIN2_VAL, GAMMA_Y_SPIN3_VAL}, - {GAMMA_X_SPIN0_VAL, GAMMA_X_SPIN1_VAL, GAMMA_X_SPIN2_VAL, GAMMA_X_SPIN3_VAL}}; - -static const int gamma_offset[4][4] = { - {GAMMA_T_SPIN0_OFFSET,GAMMA_T_SPIN1_OFFSET,GAMMA_T_SPIN2_OFFSET,GAMMA_T_SPIN3_OFFSET}, - {GAMMA_Z_SPIN0_OFFSET,GAMMA_Z_SPIN1_OFFSET,GAMMA_Z_SPIN2_OFFSET,GAMMA_Z_SPIN3_OFFSET}, - {GAMMA_Y_SPIN0_OFFSET,GAMMA_Y_SPIN1_OFFSET,GAMMA_Y_SPIN2_OFFSET,GAMMA_Y_SPIN3_OFFSET}, - {GAMMA_X_SPIN0_OFFSET,GAMMA_X_SPIN1_OFFSET,GAMMA_X_SPIN2_OFFSET,GAMMA_X_SPIN3_OFFSET}}; - -static const int gamma_re_sign[4][4] = { - {GAMMA_T_SPIN0_RE_SIGN,GAMMA_T_SPIN1_RE_SIGN,GAMMA_T_SPIN2_RE_SIGN,GAMMA_T_SPIN3_RE_SIGN}, - {GAMMA_Z_SPIN0_RE_SIGN,GAMMA_Z_SPIN1_RE_SIGN,GAMMA_Z_SPIN2_RE_SIGN,GAMMA_Z_SPIN3_RE_SIGN}, - {GAMMA_Y_SPIN0_RE_SIGN,GAMMA_Y_SPIN1_RE_SIGN,GAMMA_Y_SPIN2_RE_SIGN,GAMMA_Y_SPIN3_RE_SIGN}, - {GAMMA_X_SPIN0_RE_SIGN,GAMMA_X_SPIN1_RE_SIGN,GAMMA_X_SPIN2_RE_SIGN,GAMMA_X_SPIN3_RE_SIGN}}; - -static const int gamma_im_sign[4][4] = { - {GAMMA_T_SPIN0_IM_SIGN,GAMMA_T_SPIN1_IM_SIGN,GAMMA_T_SPIN2_IM_SIGN,GAMMA_T_SPIN3_IM_SIGN}, - {GAMMA_Z_SPIN0_IM_SIGN,GAMMA_Z_SPIN1_IM_SIGN,GAMMA_Z_SPIN2_IM_SIGN,GAMMA_Z_SPIN3_IM_SIGN}, - {GAMMA_Y_SPIN0_IM_SIGN,GAMMA_Y_SPIN1_IM_SIGN,GAMMA_Y_SPIN2_IM_SIGN,GAMMA_Y_SPIN3_IM_SIGN}, - {GAMMA_X_SPIN0_IM_SIGN,GAMMA_X_SPIN1_IM_SIGN,GAMMA_X_SPIN2_IM_SIGN,GAMMA_X_SPIN3_IM_SIGN}}; -#endif - #endif diff --git a/src/coarse_oddeven_generic.c b/src/coarse_oddeven_generic.c index af7000b..a5c4e5a 100644 --- a/src/coarse_oddeven_generic.c +++ b/src/coarse_oddeven_generic.c @@ -299,18 +299,13 @@ void coarse_diag_ee_PRECISION( vector_PRECISION *y, vector_PRECISION *x, operato int start, end; compute_core_start_end_custom( 0, op->num_even_sites, &start, &end, l, threading, 1 ); // even sites -#ifndef OPTIMIZED_COARSE_SELF_COUPLING_PRECISION coarse_self_couplings_PRECISION( y, x, op, start, end, l ); -#else - coarse_self_couplings_PRECISION_vectorized( y, x, op, start, end, l ); -#endif } void coarse_diag_oo_PRECISION( vector_PRECISION *y, vector_PRECISION *x, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { int start, end; -#ifndef OPTIMIZED_COARSE_SELF_COUPLING_PRECISION vector_PRECISION x_pt, y_pt; int num_site_var=l->num_lattice_site_var, oo_inv_size = SQUARE(num_site_var); @@ -333,10 +328,6 @@ void coarse_diag_oo_PRECISION( vector_PRECISION *y, vector_PRECISION *x, operato sc += oo_inv_size; } -#else - compute_core_start_end_custom( op->num_even_sites, l->num_inner_lattice_sites, 
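For reference on the clifford.h hunks above: only the SSE sign/offset/shuffle tables are removed; the CO/VAL macro pairs that survive encode each gamma matrix sparsely, with spin row mu holding a single nonzero GAMMA_?_SPINmu_VAL in column GAMMA_?_SPINmu_CO. Applying such a matrix to one color component of a 4-spinor is then a single multiply per spin row (sketch, names hypothetical):

#include <complex.h>

/* eta = gamma * phi on one color component: co[mu]/val[mu] mirror the
 * GAMMA_?_SPINmu_CO / GAMMA_?_SPINmu_VAL pairs kept by the patch. */
void apply_gamma_sparse( double complex eta[4], const double complex phi[4],
                         const int co[4], const double complex val[4] ) {
  for ( int mu=0; mu<4; mu++ )
    eta[mu] = val[mu] * phi[ co[mu] ];   /* one nonzero per spin row */
}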
&start, &end, l, threading, 1 ); - coarse_self_couplings_PRECISION_vectorized( y, x, op, start, end, l ); -#endif } void coarse_diag_PRECISION( vector_PRECISION *y, vector_PRECISION *x, operator_PRECISION_struct *op, level_struct *l ) { @@ -356,20 +347,10 @@ void coarse_diag_oo_inv_PRECISION( vector_PRECISION *y, vector_PRECISION *x, ope int num_site_var = l->num_lattice_site_var, oo_inv_size = SQUARE(num_site_var); -#ifndef OPTIMIZED_COARSE_SELF_COUPLING_PRECISION #ifdef HAVE_TM1p1 config_PRECISION sc = (g.n_flavours==2) ? op->clover_doublet_oo_inv:op->clover_oo_inv; #else config_PRECISION sc = op->clover_oo_inv; -#endif -#else - int lda = SIMD_LENGTH_PRECISION*((num_site_var+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - oo_inv_size = 2*num_site_var*lda; -#ifdef HAVE_TM1p1 - OPERATOR_TYPE_PRECISION *sc = (g.n_flavours==2) ? op->clover_doublet_oo_inv_vectorized:op->clover_oo_inv_vectorized; -#else - OPERATOR_TYPE_PRECISION *sc = op->clover_oo_inv_vectorized; -#endif #endif x_pt.vector_buffer = x->vector_buffer + num_site_var*(op->num_even_sites+start); @@ -377,13 +358,7 @@ void coarse_diag_oo_inv_PRECISION( vector_PRECISION *y, vector_PRECISION *x, ope sc += oo_inv_size*start; for ( int i=start; inum_odd_sites, &start, &end, l, threading, 1); -#ifndef OPTIMIZED_COARSE_SELF_COUPLING_PRECISION - int size = SQUARE(2*nv); for( int i=start; iclover_oo_inv+i*size, op, op->num_even_sites+i, l ); @@ -411,24 +384,6 @@ void coarse_oddeven_PRECISION_set_self_couplings( level_struct *l, struct Thread coarse_selfcoupling_LU_doublet_decomposition_PRECISION( op->clover_doublet_oo_inv+i*size_doublet, op, op->num_even_sites+i, l ); #endif - -#else - - int column_offset = SIMD_LENGTH_PRECISION*((2*nv+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - int size_v = 2*2*nv*column_offset; - for( int i=start; iclover_oo_inv_vectorized + i*size_v, - op->clover_vectorized + (op->num_even_sites+i)*size_v, column_offset ); - -#ifdef HAVE_TM1p1 - int column_doublet_offset = SIMD_LENGTH_PRECISION*((4*nv+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - int size_doublet_v = 2*4*nv*column_doublet_offset; - for( int i=start; iclover_doublet_oo_inv_vectorized + i*size_doublet_v, - op->clover_doublet_vectorized + (op->num_even_sites+i)*size_doublet_v, column_doublet_offset ); -#endif - -#endif } void coarse_oddeven_PRECISION_set_couplings( level_struct *l, struct Thread *threading ) { @@ -481,23 +436,10 @@ void coarse_oddeven_alloc_PRECISION( level_struct *l ) { } } -#ifndef OPTIMIZED_COARSE_SELF_COUPLING_PRECISION - MALLOC( op->clover_oo_inv, complex_PRECISION, SQUARE(2*nv)*op->num_odd_sites ); #ifdef HAVE_TM1p1 MALLOC( op->clover_doublet_oo_inv, complex_PRECISION, SQUARE(4*nv)*op->num_odd_sites ); #endif - -#else - int column_offset = SIMD_LENGTH_PRECISION*((2*nv+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - MALLOC_HUGEPAGES( op->clover_oo_inv_vectorized, PRECISION, 2*2*nv*column_offset*op->num_odd_sites, 4*SIMD_LENGTH_PRECISION ); -#ifdef HAVE_TM1p1 - int column_doublet_offset = SIMD_LENGTH_PRECISION*((4*nv+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - MALLOC_HUGEPAGES( op->clover_doublet_oo_inv_vectorized, PRECISION, 2*4*nv*column_doublet_offset*op->num_odd_sites, 4*SIMD_LENGTH_PRECISION ); -#endif - -#endif - // define data layout eot = op->index_table; define_eot( eot, N, l ); @@ -607,23 +549,10 @@ void coarse_oddeven_free_PRECISION( level_struct *l ) { operator_PRECISION_struct *op = &(l->oe_op_PRECISION); operator_PRECISION_free( op, _ODDEVEN, l ); - coarse_operator_PRECISION_free_vectorized( op, l 
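coarse_diag_oo_inv_PRECISION keeps the scalar path: every odd site stores the LU factorization of its num_site_var x num_site_var self-coupling block (built by coarse_selfcoupling_LU_decomposition_PRECISION above), and the inverse is applied by forward/backward substitution per site. A minimal sketch, assuming a dense row-major LU with unit lower diagonal and no pivoting:

#include <complex.h>

/* Solve (LU) y = x for one site block, n = num_site_var */
void lu_solve_site( float complex *y, const float complex *x,
                    const float complex *LU, int n ) {
  for ( int i=0; i<n; i++ ) {            /* forward substitution with L */
    y[i] = x[i];
    for ( int j=0; j<i; j++ )
      y[i] -= LU[i*n+j] * y[j];
  }
  for ( int i=n-1; i>=0; i-- ) {         /* backward substitution with U */
    for ( int j=i+1; j<n; j++ )
      y[i] -= LU[i*n+j] * y[j];
    y[i] /= LU[i*n+i];
  }
}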
); - -#ifndef OPTIMIZED_COARSE_SELF_COUPLING_PRECISION FREE( op->clover_oo_inv, complex_PRECISION, SQUARE(2*nv)*op->num_odd_sites ); #ifdef HAVE_TM1p1 FREE( op->clover_doublet_oo_inv, complex_PRECISION, SQUARE(4*nv)*op->num_odd_sites ); -#endif - -#else - int column_offset = SIMD_LENGTH_PRECISION*((2*nv+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - FREE_HUGEPAGES( op->clover_oo_inv_vectorized, PRECISION, 2*2*nv*column_offset*op->num_odd_sites ); -#ifdef HAVE_TM1p1 - int column_doublet_offset = SIMD_LENGTH_PRECISION*((4*nv+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - FREE_HUGEPAGES( op->clover_doublet_oo_inv_vectorized, PRECISION, 2*4*nv*column_doublet_offset*op->num_odd_sites ); -#endif - #endif for (int k=0; k<2; k++ ) vector_PRECISION_free( &(op->buffer[k]), l, no_threading ); @@ -774,15 +703,6 @@ void coarse_hopping_term_PRECISION( vector_PRECISION *out, vector_PRECISION *in, void coarse_n_hopping_term_PRECISION( vector_PRECISION *out, vector_PRECISION *in, operator_PRECISION_struct *op, const int amount, level_struct *l, struct Thread *threading ) { -#ifdef OPTIMIZED_COARSE_NEIGHBOR_COUPLING_PRECISION -#ifndef COMM_HIDING_COARSEOP - int sign = -1; - coarse_pn_hopping_term_PRECISION_vectorized( out, in, op, amount, l, sign, threading); -#else - coarse_n_hopping_term_PRECISION_vectorized( out, in, op, amount, l, threading ); -#endif - return; -#else START_NO_HYPERTHREADS(threading) int mu, i, index, num_site_var=l->num_lattice_site_var, @@ -916,421 +836,6 @@ void coarse_n_hopping_term_PRECISION( vector_PRECISION *out, vector_PRECISION *i END_LOCKED_MASTER(threading) END_NO_HYPERTHREADS(threading) -#endif -} - - -void coarse_hopping_term_PRECISION_vectorized( vector_PRECISION *out, vector_PRECISION *in, operator_PRECISION_struct *op, - const int amount, level_struct *l, struct Thread *threading ) { - -#ifdef OPTIMIZED_COARSE_NEIGHBOR_COUPLING_PRECISION - START_NO_HYPERTHREADS(threading) - - int mu, i, index, num_site_var=l->num_lattice_site_var, - start=0, num_lattice_sites=l->num_inner_lattice_sites, - plus_dir_param=_FULL_SYSTEM, minus_dir_param=_FULL_SYSTEM; - vector_PRECISION in_pt, out_pt; - in_pt = *in; - out_pt = *out; - - OPERATOR_TYPE_PRECISION *D_vectorized; - int column_offset = 2*SIMD_LENGTH_PRECISION*((l->num_parent_eig_vect+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - int vectorized_link_offset = 2*2*l->num_parent_eig_vect*column_offset; - - int core_start; - int core_end; - - // assumptions (1) self coupling has already been performed - // OR (2) "out" is initialized with zeros - set_boundary_PRECISION( out, 0, l, threading ); - - if ( amount == _EVEN_SITES ) { - minus_dir_param = _ODD_SITES; - plus_dir_param = _EVEN_SITES; - } else if ( amount == _ODD_SITES ) { - minus_dir_param = _EVEN_SITES; - plus_dir_param = _ODD_SITES; - } - - START_MASTER(threading) - if ( op->c.comm ) { - for ( mu=0; mu<4; mu++ ) { - // communicate in -mu direction - ghost_sendrecv_PRECISION( in->vector_buffer, mu, -1, &(op->c), minus_dir_param, l ); - } - } - END_MASTER(threading) - SYNC_CORES(threading) - - if ( amount == _EVEN_SITES ) { - start = op->num_even_sites, num_lattice_sites = op->num_odd_sites; - } else if ( amount == _ODD_SITES ) { - start = 0; num_lattice_sites = op->num_even_sites; - } - compute_core_start_end_custom(start, start+num_lattice_sites, &core_start, &core_end, l, threading, 1); - - // compute U_mu^dagger coupling - for ( i=core_start; ivector_buffer + num_site_var*op->neighbor_table[index]; - D_vectorized = op->D_transformed_vectorized + 
4*vectorized_link_offset*op->neighbor_table[index] + 0*vectorized_link_offset; - index++; - out_pt.vector_buffer = out->vector_buffer + num_site_var*op->neighbor_table[index+T]; - coarse_hopp_PRECISION_vectorized( &out_pt, &in_pt, D_vectorized, l ); - } - SYNC_CORES(threading) - for ( i=core_start; ivector_buffer + num_site_var*op->neighbor_table[index]; - D_vectorized = op->D_transformed_vectorized + 4*vectorized_link_offset*op->neighbor_table[index] + 1*vectorized_link_offset; - index++; - out_pt.vector_buffer = out->vector_buffer + num_site_var*op->neighbor_table[index+Z]; - coarse_hopp_PRECISION_vectorized( &out_pt, &in_pt, D_vectorized, l ); - } - SYNC_CORES(threading) - for ( i=core_start; ivector_buffer + num_site_var*op->neighbor_table[index]; - D_vectorized = op->D_transformed_vectorized + 4*vectorized_link_offset*op->neighbor_table[index] + 2*vectorized_link_offset; - index++; - out_pt.vector_buffer = out->vector_buffer + num_site_var*op->neighbor_table[index+Y]; - coarse_hopp_PRECISION_vectorized( &out_pt, &in_pt, D_vectorized, l ); - } - SYNC_CORES(threading) - for ( i=core_start; ivector_buffer + num_site_var*op->neighbor_table[index]; - D_vectorized = op->D_transformed_vectorized + 4*vectorized_link_offset*op->neighbor_table[index] + 3*vectorized_link_offset; - index++; - out_pt.vector_buffer = out->vector_buffer + num_site_var*op->neighbor_table[index+X]; - coarse_hopp_PRECISION_vectorized( &out_pt, &in_pt, D_vectorized, l ); - } - - START_LOCKED_MASTER(threading) - if ( op->c.comm ) { - for ( mu=0; mu<4; mu++ ) { - // communicate in +mu direction - ghost_sendrecv_PRECISION( out->vector_buffer, mu, +1, &(op->c), plus_dir_param, l ); - } - for ( mu=0; mu<4; mu++ ) { - // wait for -mu direction - ghost_wait_PRECISION( in->vector_buffer, mu, -1, &(op->c), minus_dir_param, l ); - } - } - END_LOCKED_MASTER(threading) - - if ( amount == _EVEN_SITES ) { - start = 0; num_lattice_sites = op->num_even_sites; - } else if ( amount == _ODD_SITES ) { - start = op->num_even_sites, num_lattice_sites = op->num_odd_sites; - } - compute_core_start_end_custom(start, start+num_lattice_sites, &core_start, &core_end, l, threading, 1); - - // compute U_mu couplings - for ( i=core_start; ivector_buffer + num_site_var*op->neighbor_table[index]; - D_vectorized = op->D_vectorized + 4*vectorized_link_offset*op->neighbor_table[index]; - index++; - in_pt.vector_buffer = in->vector_buffer + num_site_var*op->neighbor_table[index+T]; - coarse_hopp_PRECISION_vectorized( &out_pt, &in_pt, D_vectorized, l ); - - D_vectorized += vectorized_link_offset; - in_pt.vector_buffer = in->vector_buffer + num_site_var*op->neighbor_table[index+Z]; - coarse_hopp_PRECISION_vectorized( &out_pt, &in_pt, D_vectorized, l ); - - in_pt.vector_buffer = in->vector_buffer + num_site_var*op->neighbor_table[index+Y]; - D_vectorized += vectorized_link_offset; - coarse_hopp_PRECISION_vectorized( &out_pt, &in_pt, D_vectorized, l ); - - in_pt.vector_buffer = in->vector_buffer + num_site_var*op->neighbor_table[index+X]; - D_vectorized += vectorized_link_offset; - coarse_hopp_PRECISION_vectorized( &out_pt, &in_pt, D_vectorized, l ); - } - - START_LOCKED_MASTER(threading) - if ( op->c.comm ) { - for ( mu=0; mu<4; mu++ ) { - // wait for +mu direction - ghost_wait_PRECISION( out->vector_buffer, mu, +1, &(op->c), plus_dir_param, l ); - } - } - END_LOCKED_MASTER(threading) - - END_NO_HYPERTHREADS(threading) -#endif -} - - -void coarse_pn_hopping_term_PRECISION_vectorized( vector_PRECISION *out, vector_PRECISION *in, operator_PRECISION_struct *op, 
- const int amount, level_struct *l, int sign, struct Thread *threading ) { - -#ifdef OPTIMIZED_COARSE_NEIGHBOR_COUPLING_PRECISION - START_NO_HYPERTHREADS(threading) - - int mu, i, num_site_var=l->num_lattice_site_var, - start=0, num_lattice_sites=l->num_inner_lattice_sites, - plus_dir_param=_FULL_SYSTEM, minus_dir_param=_FULL_SYSTEM; - vector_PRECISION in_pt, out_pt; - in_pt = *in; - out_pt = *out; - - OPERATOR_TYPE_PRECISION *D_vectorized; - int column_offset = 2*SIMD_LENGTH_PRECISION*((l->num_parent_eig_vect+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - int link_offset = 2*2*l->num_parent_eig_vect*column_offset; - int *neighbor_fw = op->neighbor_table; - int *neighbor_bw = op->backward_neighbor_table; - - int core_start; - int core_end; - - void (*coarse_hopp)(vector_PRECISION *eta, vector_PRECISION *phi, OPERATOR_TYPE_PRECISION *D, level_struct *l); - if(sign == +1) - coarse_hopp = coarse_hopp_PRECISION_vectorized; - else - coarse_hopp = coarse_n_hopp_PRECISION_vectorized; - - - if ( l->num_processes > 1 && op->c.comm ) { - set_boundary_PRECISION( out, 0, l, threading ); - - if ( amount == _EVEN_SITES ) { - minus_dir_param = _ODD_SITES; - plus_dir_param = _EVEN_SITES; - } else if ( amount == _ODD_SITES ) { - minus_dir_param = _EVEN_SITES; - plus_dir_param = _ODD_SITES; - } - - START_MASTER(threading) - for ( mu=0; mu<4; mu++ ) { - // send in -mu direction - ghost_sendrecv_PRECISION( in->vector_buffer, mu, -1, &(op->c), minus_dir_param, l ); - } - END_MASTER(threading) - - if ( amount == _EVEN_SITES ) { - start = op->num_even_sites, num_lattice_sites = op->num_odd_sites; - } else if ( amount == _ODD_SITES ) { - start = 0; num_lattice_sites = op->num_even_sites; - } - compute_core_start_end_custom(start, start+num_lattice_sites, &core_start, &core_end, l, threading, 1); - - // prepare for sending to fw: compute hopping terms into forward boundary buffer - for ( i=core_start; inum_inner_lattice_sites) - continue; - out_pt.vector_buffer = out->vector_buffer + num_site_var*neighbor_fw[5*i+1+mu]; - in_pt.vector_buffer = in->vector_buffer + num_site_var*neighbor_fw[5*i]; - D_vectorized = op->D_transformed_vectorized + 4*link_offset*neighbor_fw[5*i] + mu*link_offset; - coarse_hopp( &out_pt, &in_pt, D_vectorized, l ); - } - } - START_LOCKED_MASTER(threading) - for ( mu=0; mu<4; mu++ ) { - // send in +mu direction - ghost_sendrecv_PRECISION( out->vector_buffer, mu, +1, &(op->c), plus_dir_param, l ); - } - for ( mu=0; mu<4; mu++ ) { - // wait for -mu direction - ghost_wait_PRECISION( in->vector_buffer, mu, -1, &(op->c), minus_dir_param, l ); - } - END_LOCKED_MASTER(threading) - } - else - SYNC_CORES(threading) - - - if ( amount == _EVEN_SITES ) { - start = 0; num_lattice_sites = op->num_even_sites; - } else if ( amount == _ODD_SITES ) { - start = op->num_even_sites, num_lattice_sites = op->num_odd_sites; - } - compute_core_start_end_custom(start, start+num_lattice_sites, &core_start, &core_end, l, threading, 1); - - // assumptions (1) self coupling has already been performed - // OR (2) "out" is initialized with zeros - for ( i=core_start; ivector_buffer + num_site_var*neighbor_fw[5*i]; - - // U_mu^dagger coupling - for(int mu=0; mu<4; mu++) { - // terms coming from backward boundary buffer are done by the ghost_wait_PRECISION call below - if(neighbor_bw[5*i+1+mu] >= l->num_inner_lattice_sites) - continue; - D_vectorized = op->D_transformed_vectorized + 4*link_offset*neighbor_bw[5*i+1+mu] + mu*link_offset; - in_pt.vector_buffer = in->vector_buffer + num_site_var*neighbor_bw[5*i+1+mu]; - 
coarse_hopp( &out_pt, &in_pt, D_vectorized, l ); - } - - // compute U_mu couplings - for(int mu=0; mu<4; mu++) { - D_vectorized = op->D_vectorized + 4*link_offset*neighbor_fw[5*i] + mu*link_offset; - in_pt.vector_buffer = in->vector_buffer + num_site_var*neighbor_fw[5*i+1+mu]; - coarse_hopp( &out_pt, &in_pt, D_vectorized, l ); - } - } - - - // wait for terms from bw and add them - if ( l->num_processes > 1 && op->c.comm ) { - START_LOCKED_MASTER(threading) - for ( mu=0; mu<4; mu++ ) { - // wait for +mu direction - ghost_wait_PRECISION( out->vector_buffer, mu, +1, &(op->c), plus_dir_param, l ); - } - END_LOCKED_MASTER(threading) - } - else - SYNC_CORES(threading) - - END_NO_HYPERTHREADS(threading) -#endif -} - - -void coarse_n_hopping_term_PRECISION_vectorized( vector_PRECISION *out, vector_PRECISION *in, operator_PRECISION_struct *op, - const int amount, level_struct *l, struct Thread *threading ) { - -#ifdef OPTIMIZED_COARSE_NEIGHBOR_COUPLING_PRECISION - START_NO_HYPERTHREADS(threading) - - int mu, i, index, num_site_var=l->num_lattice_site_var, - start=0, num_lattice_sites=l->num_inner_lattice_sites, - plus_dir_param=_FULL_SYSTEM, minus_dir_param=_FULL_SYSTEM; - vector_PRECISION in_pt, out_pt; - in_pt = *in; - out_pt = *out; - - OPERATOR_TYPE_PRECISION *D_vectorized; - int column_offset = 2*SIMD_LENGTH_PRECISION*((l->num_parent_eig_vect+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - int vectorized_link_offset = 2*2*l->num_parent_eig_vect*column_offset; - - int core_start; - int core_end; - - // assumptions (1) self coupling has already been performed - // OR (2) "out" is initialized with zeros - set_boundary_PRECISION( out, 0, l, threading ); - - if ( amount == _EVEN_SITES ) { - minus_dir_param = _ODD_SITES; - plus_dir_param = _EVEN_SITES; - } else if ( amount == _ODD_SITES ) { - minus_dir_param = _EVEN_SITES; - plus_dir_param = _ODD_SITES; - } - - START_MASTER(threading) - if ( op->c.comm ) { - for ( mu=0; mu<4; mu++ ) { - // communicate in -mu direction - ghost_sendrecv_PRECISION( in->vector_buffer, mu, -1, &(op->c), minus_dir_param, l ); - } - } - END_MASTER(threading) - SYNC_CORES(threading) - - if ( amount == _EVEN_SITES ) { - start = op->num_even_sites, num_lattice_sites = op->num_odd_sites; - } else if ( amount == _ODD_SITES ) { - start = 0; num_lattice_sites = op->num_even_sites; - } - compute_core_start_end_custom(start, start+num_lattice_sites, &core_start, &core_end, l, threading, 1); - - // D is applied in an input-centric way - // this makes threading a bit ugly, is there a better way? 
- // compute U_mu^dagger coupling - for ( i=core_start; ivector_buffer + num_site_var*op->neighbor_table[index]; - D_vectorized = op->D_transformed_vectorized + 4*vectorized_link_offset*op->neighbor_table[index] + 0*vectorized_link_offset; - index++; - out_pt.vector_buffer = out->vector_buffer + num_site_var*op->neighbor_table[index+T]; - coarse_n_hopp_PRECISION_vectorized( &out_pt, &in_pt, D_vectorized, l ); - } - SYNC_CORES(threading) - for ( i=core_start; ivector_buffer + num_site_var*op->neighbor_table[index]; - D_vectorized = op->D_transformed_vectorized + 4*vectorized_link_offset*op->neighbor_table[index] + 1*vectorized_link_offset; - index++; - out_pt.vector_buffer = out->vector_buffer + num_site_var*op->neighbor_table[index+Z]; - coarse_n_hopp_PRECISION_vectorized( &out_pt, &in_pt, D_vectorized, l ); - } - SYNC_CORES(threading) - for ( i=core_start; ivector_buffer + num_site_var*op->neighbor_table[index]; - D_vectorized = op->D_transformed_vectorized + 4*vectorized_link_offset*op->neighbor_table[index] + 2*vectorized_link_offset; - index++; - out_pt.vector_buffer = out->vector_buffer + num_site_var*op->neighbor_table[index+Y]; - coarse_n_hopp_PRECISION_vectorized( &out_pt, &in_pt, D_vectorized, l ); - } - SYNC_CORES(threading) - for ( i=core_start; ivector_buffer + num_site_var*op->neighbor_table[index]; - D_vectorized = op->D_transformed_vectorized + 4*vectorized_link_offset*op->neighbor_table[index] + 3*vectorized_link_offset; - index++; - out_pt.vector_buffer = out->vector_buffer + num_site_var*op->neighbor_table[index+X]; - coarse_n_hopp_PRECISION_vectorized( &out_pt, &in_pt, D_vectorized, l ); - } - - START_LOCKED_MASTER(threading) - if ( op->c.comm ) { - for ( mu=0; mu<4; mu++ ) { - // communicate in +mu direction - ghost_sendrecv_PRECISION( out->vector_buffer, mu, +1, &(op->c), plus_dir_param, l ); - } - for ( mu=0; mu<4; mu++ ) { - // wait for -mu direction - ghost_wait_PRECISION( in->vector_buffer, mu, -1, &(op->c), minus_dir_param, l ); - } - } - END_LOCKED_MASTER(threading) - - if ( amount == _EVEN_SITES ) { - start = 0; num_lattice_sites = op->num_even_sites; - } else if ( amount == _ODD_SITES ) { - start = op->num_even_sites, num_lattice_sites = op->num_odd_sites; - } - compute_core_start_end_custom(start, start+num_lattice_sites, &core_start, &core_end, l, threading, 1); - - // compute U_mu couplings - for ( i=core_start; ivector_buffer + num_site_var*op->neighbor_table[index]; - D_vectorized = op->D_vectorized + 4*vectorized_link_offset*op->neighbor_table[index]; - index++; - in_pt.vector_buffer = in->vector_buffer + num_site_var*op->neighbor_table[index+T]; - coarse_n_hopp_PRECISION_vectorized( &out_pt, &in_pt, D_vectorized, l ); - - D_vectorized += vectorized_link_offset; - in_pt.vector_buffer = in->vector_buffer + num_site_var*op->neighbor_table[index+Z]; - coarse_n_hopp_PRECISION_vectorized( &out_pt, &in_pt, D_vectorized, l ); - - D_vectorized += vectorized_link_offset; - in_pt.vector_buffer = in->vector_buffer + num_site_var*op->neighbor_table[index+Y]; - coarse_n_hopp_PRECISION_vectorized( &out_pt, &in_pt, D_vectorized, l ); - - D_vectorized += vectorized_link_offset; - in_pt.vector_buffer = in->vector_buffer + num_site_var*op->neighbor_table[index+X]; - coarse_n_hopp_PRECISION_vectorized( &out_pt, &in_pt, D_vectorized, l ); - } - - START_LOCKED_MASTER(threading) - if ( op->c.comm ) { - for ( mu=0; mu<4; mu++ ) { - // wait for +mu direction - ghost_wait_PRECISION( out->vector_buffer, mu, +1, &(op->c), plus_dir_param, l ); - } - } - END_LOCKED_MASTER(threading) 
- - END_NO_HYPERTHREADS(threading) -#endif } diff --git a/src/coarse_oddeven_generic.h b/src/coarse_oddeven_generic.h index 85f74c6..ec33b23 100644 --- a/src/coarse_oddeven_generic.h +++ b/src/coarse_oddeven_generic.h @@ -45,12 +45,6 @@ const int amount, level_struct *l, struct Thread *threading ); void coarse_n_hopping_term_PRECISION( vector_PRECISION *out, vector_PRECISION *in, operator_PRECISION_struct *op, const int amount, level_struct *l, struct Thread *threading ); - void coarse_hopping_term_PRECISION_vectorized( vector_PRECISION *out, vector_PRECISION *in, operator_PRECISION_struct *op, - const int amount, level_struct *l, struct Thread *threading ); - void coarse_pn_hopping_term_PRECISION_vectorized( vector_PRECISION *out, vector_PRECISION *in, operator_PRECISION_struct *op, - const int amount, level_struct *l, int sign, struct Thread *threading ); - void coarse_n_hopping_term_PRECISION_vectorized( vector_PRECISION *out, vector_PRECISION *in, operator_PRECISION_struct *op, - const int amount, level_struct *l, struct Thread *threading ); void coarse_odd_even_PRECISION_test( vector_PRECISION *c4, vector_PRECISION *c1, level_struct *l, struct Thread *threading ); diff --git a/src/coarse_operator_generic.c b/src/coarse_operator_generic.c index 5d8750f..641d9af 100644 --- a/src/coarse_operator_generic.c +++ b/src/coarse_operator_generic.c @@ -37,32 +37,6 @@ void coarse_operator_PRECISION_free( level_struct *l ) { operator_PRECISION_free( &(l->next_level->op_PRECISION), _ORDINARY, l->next_level ); - coarse_operator_PRECISION_free_vectorized( &(l->next_level->s_PRECISION.op), l->next_level ); -} - -void coarse_operator_PRECISION_free_vectorized( operator_PRECISION_struct *op, level_struct *l ) { - -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - if( op->D_vectorized != NULL ) { - int n2 = (l->depth>0 && l->level>0) ? 
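With the vectorized variants deleted, the surviving scalar coarse_n_hopping_term_PRECISION retains the same communication/compute overlap: halo exchange of the input in the -mu directions is posted first, the U_mu^dagger couplings into the forward boundary buffer are computed while it is in flight, the partially accumulated output is shipped in +mu, and the U_mu couplings run on the completed input before the returned boundary terms are folded in. The schedule, with hypothetical no-op stubs standing in for ghost_sendrecv_PRECISION / ghost_wait_PRECISION:

/* Hypothetical no-op stand-ins for the halo-exchange primitives; the real
 * code drives ghost_sendrecv_PRECISION / ghost_wait_PRECISION over MPI. */
static void halo_start_minus( int mu ) { (void)mu; }  /* post input exchange in -mu  */
static void halo_start_plus( int mu )  { (void)mu; }  /* post output exchange in +mu */
static void halo_wait_minus( int mu )  { (void)mu; }
static void halo_wait_plus( int mu )   { (void)mu; }
static void couple_daggered_boundary( int mu ) { (void)mu; } /* U_mu^dag into fw buffer */
static void couple_forward( int mu )           { (void)mu; } /* U_mu on complete input  */

void hopping_term_schedule( void ) {
  for ( int mu=0; mu<4; mu++ ) halo_start_minus( mu );         /* 1 */
  for ( int mu=0; mu<4; mu++ ) couple_daggered_boundary( mu ); /* 2: overlaps with 1 */
  for ( int mu=0; mu<4; mu++ ) halo_start_plus( mu );          /* 3 */
  for ( int mu=0; mu<4; mu++ ) halo_wait_minus( mu );          /* 4 */
  for ( int mu=0; mu<4; mu++ ) couple_forward( mu );           /* 5: overlaps with 3 */
  for ( int mu=0; mu<4; mu++ ) halo_wait_plus( mu );           /* 6 */
}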
(2*l->num_lattice_sites-l->num_inner_lattice_sites):l->num_inner_lattice_sites; - int column_offset = 2*SIMD_LENGTH_PRECISION*((l->num_parent_eig_vect+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - // 2 is for complex, 4 is for 4 directions - FREE_HUGEPAGES( op->D_vectorized, OPERATOR_TYPE_PRECISION, 2*4*2*l->num_parent_eig_vect*column_offset*n2 ); - FREE_HUGEPAGES( op->D_transformed_vectorized, OPERATOR_TYPE_PRECISION, 2*4*2*l->num_parent_eig_vect*column_offset*n2 ); - } -#endif - -#ifdef OPTIMIZED_SELF_COUPLING_PRECISION - if( op->clover_vectorized != NULL ) { - int n = l->num_inner_lattice_sites; - int column_offset = SIMD_LENGTH_PRECISION*((2*l->num_parent_eig_vect+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - FREE_HUGEPAGES( op->clover_vectorized, OPERATOR_TYPE_PRECISION, 2*2*l->num_parent_eig_vect*column_offset*n ); -#ifdef HAVE_TM1p1 - int column_doublet_offset = SIMD_LENGTH_PRECISION*((4*l->num_parent_eig_vect+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - FREE_HUGEPAGES( op->clover_doublet_vectorized, OPERATOR_TYPE_PRECISION, 2*4*l->num_parent_eig_vect*column_doublet_offset*n ); -#endif - } -#endif } void coarse_operator_PRECISION_setup( vector_PRECISION *V, level_struct *l ) { @@ -307,14 +281,9 @@ void coarse_block_operator_PRECISION( vector_PRECISION *eta, vector_PRECISION *p vector_PRECISION leta1=leta, leta2=leta, lphi1=lphi, lphi2=lphi; // site-wise self coupling -#ifndef OPTIMIZED_COARSE_SELF_COUPLING_PRECISION coarse_self_couplings_PRECISION( eta, phi, &(s->op), (start/m), (start/m)+n, l); -#else - coarse_self_couplings_PRECISION_vectorized( eta, phi, &(s->op), (start/m), (start/m)+n, l ); -#endif // inner block couplings -#ifndef OPTIMIZED_COARSE_NEIGHBOR_COUPLING_PRECISION int hopp_size = 4 * SQUARE( num_eig_vect*2 ); config_PRECISION D_pt, D = s->op.D + (start/m)*hopp_size; @@ -332,29 +301,6 @@ void coarse_block_operator_PRECISION( vector_PRECISION *eta, vector_PRECISION *p coarse_daggered_hopp_PRECISION( &leta2, &lphi2, D_pt, l ); } } -#else - int column_offset = 2*SIMD_LENGTH_PRECISION*((num_eig_vect+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - int vectorized_link_offset = 2*2*num_eig_vect*column_offset; - for ( int mu=0; mu<4; mu++ ) { - OPERATOR_TYPE_PRECISION *Dplus = s->op.D_vectorized + - (start/m)*4*vectorized_link_offset + mu*vectorized_link_offset; - OPERATOR_TYPE_PRECISION *Dminus = s->op.D_transformed_vectorized + - (start/m)*4*vectorized_link_offset + mu*vectorized_link_offset; - ind = index[mu]; // mu direction - for ( int i=0; inum_parent_eig_vect*2); - int n1, n2; - int column_offset = 2*SIMD_LENGTH_PRECISION*((l->num_parent_eig_vect+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - int offset_v = 4*l->num_parent_eig_vect*column_offset; - - if ( l->depth > 0 && l->level>0 ) { - n1 = l->num_lattice_sites; - n2 = 2*l->num_lattice_sites-l->num_inner_lattice_sites; - } else { - n1 = l->num_inner_lattice_sites; - n2 = l->num_inner_lattice_sites; - } - int start, end; - compute_core_start_end_custom(0, n1, &start, &end, l, threading, 1); - int n_per_core = end-start; - START_LOCKED_MASTER(threading) - if( op->D_vectorized == NULL ) { - // 2 is for complex, 4 is for 4 directions - MALLOC_HUGEPAGES( op->D_vectorized, OPERATOR_TYPE_PRECISION, 4*offset_v*n2, 64 ); - MALLOC_HUGEPAGES( op->D_transformed_vectorized, OPERATOR_TYPE_PRECISION, 4*offset_v*n2, 64 ); - } - END_LOCKED_MASTER(threading) - - copy_coarse_operator_to_vectorized_layout_PRECISION( - op->D + 4*start*nc_size, - op->D_vectorized + 4*start*offset_v, - n_per_core, 
l->num_parent_eig_vect);
-  copy_coarse_operator_to_transformed_vectorized_layout_PRECISION(
-      op->D + 4*start*nc_size,
-      op->D_transformed_vectorized + 4*start*offset_v,
-      n_per_core, l->num_parent_eig_vect);
-  // vectorize negative boundary
-  if ( n2>n1 ) {
-    compute_core_start_end_custom(n1, n2, &start, &end, l, threading, 1);
-    n_per_core = end-start;
-    copy_coarse_operator_to_vectorized_layout_PRECISION(
-        op->D + 4*start*nc_size,
-        op->D_vectorized + 4*start*offset_v,
-        n_per_core, l->num_parent_eig_vect);
-    copy_coarse_operator_to_transformed_vectorized_layout_PRECISION(
-        op->D + 4*start*nc_size,
-        op->D_transformed_vectorized + 4*start*offset_v,
-        n_per_core, l->num_parent_eig_vect);
-  }
-  SYNC_CORES(threading)
-#endif
 }

 void coarse_operator_PRECISION_set_self_couplings( operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) {
-#ifdef OPTIMIZED_COARSE_SELF_COUPLING_PRECISION
-  int n = l->num_inner_lattice_sites, nv = l->num_parent_eig_vect;
-  int sc_size = (nv)*(nv*2+1);
-  int start, end;
-  compute_core_start_end_custom(0, n, &start, &end, l, threading, 1);
-  int n_per_core = end-start;
-
-  int column_offset = SIMD_LENGTH_PRECISION*((2*nv+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION);
-  int offset_v = 2*2*nv*column_offset;
-  if( op->clover_vectorized == NULL ) {
-    START_LOCKED_MASTER(threading)
-    MALLOC_HUGEPAGES( op->clover_vectorized, OPERATOR_TYPE_PRECISION, offset_v*n, 64 );
-    END_LOCKED_MASTER(threading)
-  }
-  copy_coarse_operator_clover_to_vectorized_layout_PRECISION(
-      op->clover + start*sc_size,
-      op->clover_vectorized + start*offset_v,
-      n_per_core, nv);
-#ifdef HAVE_TM
-  int tm_size = (nv)*(nv+1);
-  if ( op->mu + op->mu_odd_shift != 0.0 || op->mu + op->mu_even_shift != 0.0 )
-    add_tm_term_to_vectorized_layout_PRECISION(
-        op->tm_term + start*tm_size,
-        op->clover_vectorized + start*offset_v,
-        n_per_core, nv);
-#endif
-
-#ifdef HAVE_TM1p1
-  int column_doublet_offset = SIMD_LENGTH_PRECISION*((4*nv+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION);
-  int offset_doublet_v = 2*4*nv*column_doublet_offset;
-  int eps_size = (nv)*(nv+1);
-  if( op->clover_doublet_vectorized == NULL ) {
-    START_LOCKED_MASTER(threading)
-    MALLOC_HUGEPAGES( op->clover_doublet_vectorized, OPERATOR_TYPE_PRECISION, offset_doublet_v*n, 64 );
-    END_LOCKED_MASTER(threading)
-  }
-  copy_coarse_operator_clover_to_doublet_vectorized_layout_PRECISION(
-      op->clover + start*sc_size,
-      op->clover_doublet_vectorized + start*offset_doublet_v,
-      n_per_core, nv);
-  if ( op->epsbar != 0 || op->epsbar_ig5_odd_shift != 0 || op->epsbar_ig5_even_shift != 0 )
-    add_epsbar_term_to_doublet_vectorized_layout_PRECISION(
-        op->epsbar_term + start*eps_size,
-        op->clover_doublet_vectorized + start*offset_doublet_v,
-        n_per_core, nv);
-#ifdef HAVE_TM
-  if ( op->mu + op->mu_odd_shift != 0.0 || op->mu + op->mu_even_shift != 0.0 )
-    add_tm_term_to_doublet_vectorized_layout_PRECISION(
-        op->tm_term + start*tm_size,
-        op->clover_doublet_vectorized + start*offset_doublet_v,
-        n_per_core, nv);
-#endif
-#endif
-  SYNC_CORES(threading)
-#endif
 }

 void coarse_gamma5_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, int start, int end, level_struct *l ) {
@@ -687,20 +530,12 @@ void apply_coarse_operator_PRECISION( vector_PRECISION *eta, vector_PRECISION *p
   int end;
   compute_core_start_end_custom(0, l->num_inner_lattice_sites, &start, &end, l, threading, 1);
-#ifndef OPTIMIZED_COARSE_SELF_COUPLING_PRECISION
   coarse_self_couplings_PRECISION( eta, phi, op, start, end, l);
-#else
-
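All of the removed layout code sizes its buffers with the same round-up: pad a row count to the next multiple of the SIMD width so every column fills whole registers. A minimal sketch of the idiom (the helper name is hypothetical; simd_width stands in for SIMD_LENGTH_PRECISION):

  /* pad n up to the next multiple of the SIMD width, as in
     SIMD_LENGTH_PRECISION*((n+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION) */
  static inline int round_up_to_simd( int n, int simd_width ) {
    return simd_width * ( ( n + simd_width - 1 ) / simd_width );
  }
  /* e.g. round_up_to_simd( 2*10, 8 ) == 24: a 20-row column is padded
     to 24 entries so each column occupies whole SIMD registers */
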
coarse_self_couplings_PRECISION_vectorized( eta, phi, op, start, end, l ); -#endif PROF_PRECISION_STOP( _SC, 1, threading ); PROF_PRECISION_START( _NC, threading ); -#ifndef OPTIMIZED_COARSE_NEIGHBOR_COUPLING_PRECISION coarse_hopping_term_PRECISION( eta, phi, op, _FULL_SYSTEM, l, threading ); -#else - coarse_hopping_term_PRECISION_vectorized( eta, phi, op, _FULL_SYSTEM, l, threading ); -#endif PROF_PRECISION_STOP( _NC, 1, threading ); } @@ -749,20 +584,9 @@ void coarse_operator_PRECISION_test_routine( level_struct *l, struct Thread *thr if(g.n_flavours == 1) #endif { -#ifdef INTERPOLATION_OPERATOR_LAYOUT_OPTIMIZED_PRECISION - double norm = 0.0; - double dot = 0.0; - float *op = (float *)l->is_PRECISION.operator; - float *op2 = (float *)(l->is_PRECISION.operator+0*SIMD_LENGTH_PRECISION*l->vector_size)+1; - for ( int i=0; iinner_vector_size; i++ ) - norm += (op[2*i*SIMD_LENGTH_PRECISION+0] + I*op[2*i*SIMD_LENGTH_PRECISION+SIMD_LENGTH_PRECISION])*conj(op[2*i*SIMD_LENGTH_PRECISION+0] + I*op[2*i*SIMD_LENGTH_PRECISION+SIMD_LENGTH_PRECISION]); - for ( int i=0; iinner_vector_size; i++ ) - dot += (op[2*i*SIMD_LENGTH_PRECISION+0] + I*op[2*i*SIMD_LENGTH_PRECISION+SIMD_LENGTH_PRECISION])*conj(op2[2*i*SIMD_LENGTH_PRECISION+0] + I*op2[2*i*SIMD_LENGTH_PRECISION+SIMD_LENGTH_PRECISION]); - diff = dot/norm; -#else diff = global_inner_product_PRECISION( &(l->is_PRECISION.interpolation[0]), &(l->is_PRECISION.interpolation[1]), 0, ivs, l, no_threading ) / global_norm_PRECISION( &(l->is_PRECISION.interpolation[0]), 0, ivs, l, no_threading ); -#endif + test0_PRECISION("depth: %d, correctness of block_gram_schmidt: %le\n", l->depth, cabs(diff) ); } diff --git a/src/coarse_operator_generic.h b/src/coarse_operator_generic.h index 25215a0..a33c594 100644 --- a/src/coarse_operator_generic.h +++ b/src/coarse_operator_generic.h @@ -22,13 +22,10 @@ #ifndef COARSE_OPERATOR_PRECISION_HEADER #define COARSE_OPERATOR_PRECISION_HEADER - #include "blas_vectorized.h" - struct Thread; void coarse_operator_PRECISION_alloc( level_struct *l ); void coarse_operator_PRECISION_free( level_struct *l ); - void coarse_operator_PRECISION_free_vectorized( operator_PRECISION_struct *op, level_struct *l ); void coarse_operator_PRECISION_setup( vector_PRECISION *V, level_struct *l ); void coarse_operator_PRECISION_setup_finalize( level_struct *l, struct Thread *threading ); void coarse_operator_PRECISION_set_couplings( operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); diff --git a/src/dirac_generic.c b/src/dirac_generic.c index 57f54e1..026b171 100644 --- a/src/dirac_generic.c +++ b/src/dirac_generic.c @@ -79,9 +79,6 @@ void clover_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, operator_PR #endif } else { - -#ifndef OPTIMIZED_SELF_COUPLING_PRECISION - config_PRECISION clover = op->clover+(start/nv)*42; #ifdef HAVE_TM1p1 if( g.n_flavours == 2 ) { @@ -123,23 +120,6 @@ void clover_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, operator_PR #ifdef HAVE_TM1p1 } #endif - -#else - -#ifdef HAVE_TM1p1 - PRECISION *clover = ( g.n_flavours == 2 ) ? 
op->clover_doublet_vectorized : op->clover_vectorized; -#else - PRECISION *clover = op->clover_vectorized; -#endif - clover += start*12; - while ( leta < leta_end ) { // tm_term included in the clover vectorized - sse_site_clover_PRECISION( (PRECISION*)leta, (PRECISION*)lphi, clover ); - leta += nv; lphi += nv; - clover += 12*nv; - } - -#endif - } #ifdef HAVE_TM1p1 @@ -248,8 +228,6 @@ void clover_PRECISION_new( vector_PRECISION *eta, vector_PRECISION *phi, operato } else { -#ifndef OPTIMIZED_SELF_COUPLING_PRECISION - config_PRECISION clover = op->clover+(start/nv)*42; /*#ifdef HAVE_TM1p1 if( g.n_flavours == 2 ) { @@ -301,24 +279,7 @@ void clover_PRECISION_new( vector_PRECISION *eta, vector_PRECISION *phi, operato } /*#ifdef HAVE_TM1p1 } -#endif - -#else - -#ifdef HAVE_TM1p1 - PRECISION *clover = ( g.n_flavours == 2 ) ? op->clover_doublet_vectorized : op->clover_vectorized; -#else - PRECISION *clover = op->clover_vectorized; -#endif - clover += start*12; - while ( leta < leta_end ) { // tm_term included in the clover vectorized - sse_site_clover_PRECISION( (PRECISION*)leta, (PRECISION*)lphi, clover ); - leta += nv; lphi += nv; - clover += 12*nv; - } - */ -#endif - +#endif */ } /* #ifdef HAVE_TM1p1 @@ -391,14 +352,6 @@ void block_d_plus_clover_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi // clover term clover_PRECISION(eta, phi, &(s->op), start, start+nv*n, l, no_threading ); -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_float // block operator vectorized just in the float environment - PRECISION *Dplus = s->op.D_vectorized + (start/nv)*96; - PRECISION *Dminus = s->op.D_transformed_vectorized + (start/nv)*96; - for ( int mu=0; mu<4; mu++ ) { - block_oddeven_plus_coupling_PRECISION( (PRECISION*)leta, Dplus, (PRECISION*)lphi, mu, 0, length[mu], index[mu], neighbor ); - block_oddeven_minus_coupling_PRECISION( (PRECISION*)leta, Dminus, (PRECISION*)lphi, mu, 0, length[mu], index[mu], neighbor ); - } -#else int i, j, k, *ind; config_PRECISION D_pt; config_PRECISION D = s->op.D + (start/nv)*36; @@ -524,7 +477,6 @@ void block_d_plus_clover_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi } #ifdef HAVE_TM1p1 } -#endif #endif END_UNTHREADED_FUNCTION(threading) } @@ -532,14 +484,9 @@ void block_d_plus_clover_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi void d_plus_clover_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { int n = l->num_inner_lattice_sites, *neighbor = op->neighbor_table, start, end, nv = l->num_lattice_site_var; -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - complex_PRECISION *prn[4] = { op->prnT, op->prnZ, op->prnY, op->prnX }; - complex_PRECISION *prp[4] = { op->prpT, op->prpZ, op->prpY, op->prpX }; -#else int i, j, *nb_pt; buffer_PRECISION phi_pt, eta_pt, end_pt; config_PRECISION D_pt; -#endif compute_core_start_end(0, nv*n, &start, &end, l, threading ); @@ -553,9 +500,6 @@ void d_plus_clover_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, oper #ifdef HAVE_TM1p1 if( g.n_flavours == 2 ) { -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - dprp_PRECISION( prn, phi->vector_buffer, start, end ); -#else complex_PRECISION pbuf[12]; for ( i=start/2, phi_pt=phi->vector_buffer+start; iprnT+i, phi_pt ); @@ -563,7 +507,6 @@ void d_plus_clover_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, oper dprp_Y_PRECISION( op->prnY+i, phi_pt ); dprp_X_PRECISION( op->prnX+i, phi_pt ); } -#endif // start communication in negative direction START_LOCKED_MASTER(threading) 
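Here and in the surrounding routines the same overlap pattern recurs: project the boundary spinors, post the ghost exchange, do interior work, then wait and fold in the received halo. A minimal MPI sketch of one such exchange, independent of the internals of ghost_sendrecv_PRECISION (buffer names, neighbor ranks and the tag are placeholders):

  #include <mpi.h>
  #include <complex.h>

  /* post a nonblocking halo exchange for one direction, overlap it with
     interior computation, then wait; complex data is sent as float pairs */
  void halo_exchange_sketch( float complex *send_buf, float complex *recv_buf,
                             int count, int rank_up, int rank_down, MPI_Comm comm ) {
    MPI_Request reqs[2];
    MPI_Irecv( recv_buf, 2*count, MPI_FLOAT, rank_down, 0, comm, &reqs[0] );
    MPI_Isend( send_buf, 2*count, MPI_FLOAT, rank_up,   0, comm, &reqs[1] );
    /* ... interior computation that does not touch recv_buf ... */
    MPI_Waitall( 2, reqs, MPI_STATUSES_IGNORE );
  }
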
ghost_sendrecv_PRECISION( op->prnT, T, -1, &(op->c), _FULL_SYSTEM, l ); @@ -572,9 +515,6 @@ void d_plus_clover_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, oper ghost_sendrecv_PRECISION( op->prnX, X, -1, &(op->c), _FULL_SYSTEM, l ); END_LOCKED_MASTER(threading) -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - dprn_su3_PRECISION( prp, phi->vector_buffer, op, neighbor, start, end ); -#else // project plus dir and multiply with U dagger for ( phi_pt=phi->vector_buffer+start, end_pt=phi->vector_buffer+end, D_pt = op->D+((start/nv)*36), nb_pt=neighbor+((start/nv)*4); phi_ptprpX+j+6, D_pt, pbuf+6 ); mvmh_PRECISION( op->prpX+j+9, D_pt, pbuf+9 ); D_pt += 9; } -#endif // start communication in positive direction START_LOCKED_MASTER(threading) @@ -621,9 +560,6 @@ void d_plus_clover_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, oper ghost_wait_PRECISION( op->prnX, X, -1, &(op->c), _FULL_SYSTEM, l ); END_LOCKED_MASTER(threading) -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - su3_dpbp_PRECISION( eta->vector_buffer, prn, op, neighbor, start, end ); -#else // multiply with U and lift up minus dir for ( eta_pt=eta->vector_buffer+start, end_pt=eta->vector_buffer+end, D_pt = op->D+(start/nv)*36, nb_pt=neighbor+(start/nv)*4; eta_ptprnX+j+9 ); dpbp_su3_X_PRECISION( pbuf, eta_pt ); D_pt += 9; } -#endif // wait for communication in positive direction START_LOCKED_MASTER(threading) @@ -666,22 +601,15 @@ void d_plus_clover_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, oper END_LOCKED_MASTER(threading) // lift up plus dir -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - dpbn_PRECISION( eta->vector_buffer, prp, start, end ); -#else for ( i=start/2, eta_pt=eta->vector_buffer+start; iprpT+i, eta_pt ); dpbn_su3_Z_PRECISION( op->prpZ+i, eta_pt ); dpbn_su3_Y_PRECISION( op->prpY+i, eta_pt ); dpbn_su3_X_PRECISION( op->prpX+i, eta_pt ); } -#endif } else { #endif -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - prp_PRECISION( prn, phi->vector_buffer, start, end ); -#else complex_PRECISION pbuf[6]; for ( i=start/2, phi_pt=phi->vector_buffer+start; iprnT+i, phi_pt ); @@ -689,7 +617,6 @@ void d_plus_clover_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, oper prp_Y_PRECISION( op->prnY+i, phi_pt ); prp_X_PRECISION( op->prnX+i, phi_pt ); } -#endif // start communication in negative direction START_LOCKED_MASTER(threading) ghost_sendrecv_PRECISION( op->prnT, T, -1, &(op->c), _FULL_SYSTEM, l ); @@ -699,9 +626,6 @@ void d_plus_clover_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, oper END_LOCKED_MASTER(threading) // project plus dir and multiply with U dagger -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - prn_su3_PRECISION( prp, phi->vector_buffer, op, neighbor, start, end ); -#else for ( phi_pt=phi->vector_buffer+start, end_pt=phi->vector_buffer+end, D_pt = op->D+(start*3), nb_pt=neighbor+((start/12)*4); phi_ptprpX+j, D_pt, pbuf ); mvmh_PRECISION( op->prpX+j+3, D_pt, pbuf+3 ); D_pt += 9; } -#endif // start communication in positive direction START_LOCKED_MASTER(threading) @@ -740,9 +663,6 @@ void d_plus_clover_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, oper END_LOCKED_MASTER(threading) // multiply with U and lift up minus dir -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - su3_pbp_PRECISION( eta->vector_buffer, prn, op, neighbor, start, end ); -#else for ( eta_pt=eta->vector_buffer+start, end_pt=eta->vector_buffer+end, D_pt = op->D+start*3, nb_pt=neighbor+(start/12)*4; eta_ptprnX+j+3 ); pbp_su3_X_PRECISION( pbuf, eta_pt ); D_pt += 9; } -#endif // wait for 
communication in positive direction START_LOCKED_MASTER(threading) @@ -776,16 +695,12 @@ void d_plus_clover_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, oper END_LOCKED_MASTER(threading) // lift up plus dir -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - pbn_PRECISION( eta->vector_buffer, prp, start, end ); -#else for ( i=start/2, eta_pt=eta->vector_buffer+start; iprpT+i, eta_pt ); pbn_su3_Z_PRECISION( op->prpZ+i, eta_pt ); pbn_su3_Y_PRECISION( op->prpY+i, eta_pt ); pbn_su3_X_PRECISION( op->prpX+i, eta_pt ); } -#endif #ifdef HAVE_TM1p1 } #endif @@ -801,15 +716,10 @@ void d_plus_clover_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, oper void d_plus_clover_PRECISION_new( vector_PRECISION *eta, vector_PRECISION *phi, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { int n = l->num_inner_lattice_sites, *neighbor = op->neighbor_table, start, end, nv = l->num_lattice_site_var, n_vect = g.num_rhs_vect; -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - complex_PRECISION *prn[4] = { op->prnT, op->prnZ, op->prnY, op->prnX }; - complex_PRECISION *prp[4] = { op->prpT, op->prpZ, op->prpY, op->prpX }; -#else int i, j, *nb_pt; buffer_PRECISION phi_pt, eta_pt, end_pt; config_PRECISION D_pt; //int phi_shift = (phi->num_vect == 1)?0:phi->size*n_vec, eta_shift = (eta->num_vect == 1)?0:eta->size*n_vec; -#endif compute_core_start_end(0, nv*n, &start, &end, l, threading ); @@ -824,9 +734,6 @@ void d_plus_clover_PRECISION_new( vector_PRECISION *eta, vector_PRECISION *phi, /* #ifdef HAVE_TM1p1 if( g.n_flavours == 2 ) { -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - dprp_PRECISION( prn, phi->vector_buffer, start, end ); -#else complex_PRECISION pbuf[12]; for ( i=start/2, phi_pt=phi->vector_buffer+start+phi_shift; iprnT+i, phi_pt ); @@ -834,7 +741,6 @@ void d_plus_clover_PRECISION_new( vector_PRECISION *eta, vector_PRECISION *phi, dprp_Y_PRECISION( op->prnY+i, phi_pt ); dprp_X_PRECISION( op->prnX+i, phi_pt ); } -#endif // start communication in negative direction START_LOCKED_MASTER(threading) ghost_sendrecv_PRECISION( op->prnT, T, -1, &(op->c), _FULL_SYSTEM, l ); @@ -842,9 +748,6 @@ void d_plus_clover_PRECISION_new( vector_PRECISION *eta, vector_PRECISION *phi, ghost_sendrecv_PRECISION( op->prnY, Y, -1, &(op->c), _FULL_SYSTEM, l ); ghost_sendrecv_PRECISION( op->prnX, X, -1, &(op->c), _FULL_SYSTEM, l ); END_LOCKED_MASTER(threading) -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - dprn_su3_PRECISION( prp, phi->vector_buffer, op, neighbor, start, end ); -#else // project plus dir and multiply with U dagger for ( phi_pt=phi->vector_buffer+start+phi_shift,c end_pt=phi->vector_buffer+end+phi_shift, D_pt = op->D+((start/nv)*36), nb_pt=neighbor+((start/nv)*4); phi_ptprpX+j+6, D_pt, pbuf+6 ); mvmh_PRECISION( op->prpX+j+9, D_pt, pbuf+9 ); D_pt += 9; } -#endif // start communication in positive direction START_LOCKED_MASTER(threading) ghost_sendrecv_PRECISION( op->prpT, T, +1, &(op->c), _FULL_SYSTEM, l ); @@ -890,9 +792,6 @@ void d_plus_clover_PRECISION_new( vector_PRECISION *eta, vector_PRECISION *phi, ghost_wait_PRECISION( op->prnX, X, -1, &(op->c), _FULL_SYSTEM, l ); END_LOCKED_MASTER(threading) -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - su3_dpbp_PRECISION( eta->vector_buffer, prn, op, neighbor, start, end ); -#else // multiply with U and lift up minus dir for ( eta_pt=eta->vector_buffer+start+eta_shift, end_pt=eta->vector_buffer+end+eta_shift, D_pt = op->D+(start/nv)*36, nb_pt=neighbor+(start/nv)*4; eta_ptprnX+j+9 ); dpbp_su3_X_PRECISION( pbuf, eta_pt ); D_pt += 9; } 
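Throughout these loops the arithmetic kernel is an mvm_PRECISION / mvmh_PRECISION style operation: a 3x3 complex link matrix applied to a color vector, plain or daggered ("multiply with U dagger" above). A scalar sketch of what such a kernel computes (row-major storage and the helper names are assumptions, not the code's actual kernels):

  #include <complex.h>

  /* y = U * x for a 3x3 complex matrix stored row major in U[9] */
  static inline void mvm_sketch( float complex *y, const float complex *U,
                                 const float complex *x ) {
    for ( int r=0; r<3; r++ )
      y[r] = U[3*r+0]*x[0] + U[3*r+1]*x[1] + U[3*r+2]*x[2];
  }

  /* y = U^dagger * x: conjugate transpose of the same matrix */
  static inline void mvmh_sketch( float complex *y, const float complex *U,
                                  const float complex *x ) {
    for ( int r=0; r<3; r++ )
      y[r] = conjf(U[r+0])*x[0] + conjf(U[r+3])*x[1] + conjf(U[r+6])*x[2];
  }
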
-#endif // wait for communication in positive direction START_LOCKED_MASTER(threading) @@ -935,22 +833,15 @@ void d_plus_clover_PRECISION_new( vector_PRECISION *eta, vector_PRECISION *phi, END_LOCKED_MASTER(threading) // lift up plus dir -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - dpbn_PRECISION( eta->vector_buffer, prp, start, end ); -#else for ( i=start/2, eta_pt=eta->vector_buffer+start+eta_shift; iprpT+i, eta_pt ); dpbn_su3_Z_PRECISION( op->prpZ+i, eta_pt ); dpbn_su3_Y_PRECISION( op->prpY+i, eta_pt ); dpbn_su3_X_PRECISION( op->prpX+i, eta_pt ); } -#endif } else { #endif */ -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - prp_PRECISION( prn, phi->vector_buffer, start, end ); -#else complex_PRECISION pbuf[6*n_vect]; for ( i=start*n_vect/2, phi_pt=phi->vector_buffer+start*n_vect; iprnT+i, phi_pt ); @@ -958,7 +849,6 @@ void d_plus_clover_PRECISION_new( vector_PRECISION *eta, vector_PRECISION *phi, prp_Y_PRECISION_new( op->prnY+i, phi_pt ); prp_X_PRECISION_new( op->prnX+i, phi_pt ); } -#endif // start communication in negative direction START_LOCKED_MASTER(threading) ghost_sendrecv_PRECISION( op->prnT, T, -1, &(op->c), _FULL_SYSTEM, l ); @@ -968,9 +858,6 @@ void d_plus_clover_PRECISION_new( vector_PRECISION *eta, vector_PRECISION *phi, END_LOCKED_MASTER(threading) // project plus dir and multiply with U dagger -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - prn_su3_PRECISION( prp, phi->vector_buffer, op, neighbor, start, end ); -#else for ( phi_pt=phi->vector_buffer+start*n_vect, end_pt=phi->vector_buffer+end*n_vect, D_pt = op->D+(start*3), nb_pt=neighbor+((start/12)*4); phi_ptprpX+j, D_pt, pbuf ); mvmh_PRECISION_new( op->prpX+j+3*n_vect, D_pt, pbuf+3*n_vect ); D_pt += 9; } -#endif // start communication in positive direction START_LOCKED_MASTER(threading) @@ -1009,9 +895,6 @@ void d_plus_clover_PRECISION_new( vector_PRECISION *eta, vector_PRECISION *phi, END_LOCKED_MASTER(threading) // multiply with U and lift up minus dir -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - su3_pbp_PRECISION( eta->vector_buffer, prn, op, neighbor, start, end ); -#else for ( eta_pt=eta->vector_buffer+start*n_vect, end_pt=eta->vector_buffer+end*n_vect, D_pt = op->D+start*3, nb_pt=neighbor+(start/12)*4; eta_ptprnX+j+3*n_vect ); pbp_su3_X_PRECISION_new( pbuf, eta_pt ); D_pt += 9; } -#endif // wait for communication in positive direction START_LOCKED_MASTER(threading) @@ -1045,16 +927,12 @@ void d_plus_clover_PRECISION_new( vector_PRECISION *eta, vector_PRECISION *phi, END_LOCKED_MASTER(threading) // lift up plus dir -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - pbn_PRECISION( eta->vector_buffer, prp, start, end ); -#else for ( i=start*n_vect/2, eta_pt=eta->vector_buffer+start*n_vect; iprpT+i, eta_pt ); pbn_su3_Z_PRECISION_new( op->prpZ+i, eta_pt ); pbn_su3_Y_PRECISION_new( op->prpY+i, eta_pt ); pbn_su3_X_PRECISION_new( op->prpX+i, eta_pt ); } -#endif /*#ifdef HAVE_TM1p1 } #endif*/ @@ -1670,13 +1548,9 @@ void operator_updates_PRECISION( level_struct *l, struct Thread *threading ) { if ( l->level > 0 ) { if ( !l->idle ) { -#ifdef INTERPOLATION_SETUP_LAYOUT_OPTIMIZED_PRECISION - coarse_operator_PRECISION_setup_vectorized( l->is_PRECISION.operator, l, threading ); - START_LOCKED_MASTER(threading) -#else START_LOCKED_MASTER(threading) coarse_operator_PRECISION_setup( l->is_PRECISION.interpolation, l ); -#endif + conf_PRECISION_gather( &(l->next_level->s_PRECISION.op), &(l->next_level->op_PRECISION), l->next_level ); END_LOCKED_MASTER(threading) if ( !l->next_level->idle && l->next_level->level > 0 ) { diff --git 
a/src/ghost_generic.c b/src/ghost_generic.c index bbc0062..528688d 100644 --- a/src/ghost_generic.c +++ b/src/ghost_generic.c @@ -51,37 +51,6 @@ void negative_sendrecv_PRECISION( vector_PRECISION *phi, const int mu, comm_PREC } -void negative_sendrecv_PRECISION_vectorized( complex_PRECISION *phi, const int mu, comm_PRECISION_struct *c, - level_struct *l, int count, complex_PRECISION *buffer ) { - // send dir = -1 - if( l->global_splitting[mu] > 1 ) { - - int i, j, num_boundary_sites = c->num_boundary_sites[2*mu+1], boundary_start, - *boundary_table = c->boundary_table[2*mu+1], n = l->num_lattice_site_var; - - complex_PRECISION *tmp_pt; - complex_PRECISION *buffer_pt; - - boundary_start = l->num_inner_lattice_sites; - for ( i=0; inum_boundary_sites[2*i]; - - buffer_pt = buffer; - - for ( i=0; ineighbor_rank[2*mu], 2*mu+1, g.comm_cart, &(c->rreqs[2*mu+1]) ); - MPI_Isend( buffer, count*n*num_boundary_sites, MPI_COMPLEX_PRECISION, - l->neighbor_rank[2*mu+1], 2*mu+1, g.comm_cart, &(c->sreqs[2*mu+1]) ); - } -} - - void negative_wait_PRECISION( const int mu, comm_PRECISION_struct *c, level_struct *l ) { if( l->global_splitting[mu] > 1 ) { diff --git a/src/ghost_generic.h b/src/ghost_generic.h index 24d14b1..59a583c 100644 --- a/src/ghost_generic.h +++ b/src/ghost_generic.h @@ -23,10 +23,6 @@ #define GHOST_PRECISION_HEADER void negative_sendrecv_PRECISION( vector_PRECISION *phi, const int mu, comm_PRECISION_struct *c, level_struct *l ); - - // as negative_sendrecv_PRECISION, but for count vectors stored in phi in vector-fused data layout - // buffer must be big enough to hold the surface data for count vectors (in one direction) - void negative_sendrecv_PRECISION_vectorized( complex_PRECISION *phi, const int mu, comm_PRECISION_struct *c, level_struct *l, int count, complex_PRECISION *buffer ); void negative_wait_PRECISION( const int mu, comm_PRECISION_struct *c, level_struct *l ); void ghost_alloc_PRECISION( int buffer_size, comm_PRECISION_struct *c, level_struct *l ); diff --git a/src/init.c b/src/init.c index 7015fa6..614c515 100644 --- a/src/init.c +++ b/src/init.c @@ -1103,13 +1103,6 @@ void validate_parameters( int ls, level_struct *l ) { int i; int mu; -#ifdef SSE - if ( !g.odd_even ) - warning0("The SSE implementation is based on the odd-even preconditioned code.\ - \n Switch on odd-even preconditioning in the input file.\n"); - ASSERT( g.odd_even ); -#endif - if ( g.method == 5 && g.interpolation != 0 ) { warning0("Multigrid with BiCGstab smoothing is not supported.\n Switching to FGMRES preconditioned with BiCGstab (g.interpolation=0).\n"); g.interpolation = 0; @@ -1133,14 +1126,6 @@ void validate_parameters( int ls, level_struct *l ) { ASSERT( DIVIDES( g.block_lattice[i][mu], g.local_lattice[i][mu] ) ); ASSERT( DIVIDES( g.global_lattice[i][mu]/g.global_lattice[i+1][mu], g.local_lattice[i][mu] ) ); ASSERT( DIVIDES( g.block_lattice[i][mu], g.global_lattice[i][mu]/g.global_lattice[i+1][mu] ) ); -#ifdef SSE - if ( g.block_lattice[i][mu] != g.global_lattice[i][mu]/g.global_lattice[i+1][mu] ) - warning0("when using SSE, Schwarz block size and aggregate size have to match.\n"); - ASSERT( g.block_lattice[i][mu] == g.global_lattice[i][mu]/g.global_lattice[i+1][mu] ); - // it works everywhere but we have some problem with the vector size. 
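The SSE branch of validate_parameters removed here enforced layout constraints: Schwarz block sizes must match aggregate sizes, block sizes must divide the local lattice, and the number of test vectors must be a SIMD multiple. A small sketch of such a divisibility check (the helper name is hypothetical; the real code uses ASSERT( DIVIDES( ... ) )):

  #include <stdio.h>
  #include <stdlib.h>

  /* abort with a message if a does not divide b, mirroring ASSERT(DIVIDES(a,b)) */
  static void check_divides( int a, int b, const char *what ) {
    if ( b % a != 0 ) {
      fprintf( stderr, "validation failed: %s (%d does not divide %d)\n", what, a, b );
      exit( EXIT_FAILURE );
    }
  }
  /* usage: check_divides( block, local_lattice, "block size vs local lattice" );
            check_divides( simd_width, num_eig_vect, "SIMD width vs test vectors" ); */
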
- // TODO: check all vectora allocated with size l->inner_vector_size - ASSERT( g.num_eig_vect[i] % SIMD_LENGTH_float == 0 ); -#endif } if ( g.odd_even ) { @@ -1179,10 +1164,6 @@ void validate_parameters( int ls, level_struct *l ) { //LIST OF CASES WHICH SHOULD WORK, BUT DO NOT (TODO) -#ifdef SSE - ASSERT( g.mixed_precision ); -#endif - //TODO: Could work without, but you need to fix the setup phase. for ( i=0; inum_eig_vect; @@ -307,4 +305,3 @@ void restrict_PRECISION( vector_PRECISION *phi_c, vector_PRECISION *phi, level_s PROF_PRECISION_STOP( _PR, 1, threading ); } -#endif diff --git a/src/linalg.c b/src/linalg.c index 402060f..d79da45 100644 --- a/src/linalg.c +++ b/src/linalg.c @@ -21,7 +21,6 @@ #include "main.h" -#ifndef OPTIMIZED_LINALG_float void process_multi_inner_product_MP( int count, complex_double *results, vector_float *phi, vector_float *psi, int start, int end, level_struct *l, struct Thread *threading ) { @@ -91,7 +90,6 @@ void process_multi_inner_product_MP_new( int count, complex_double *results, vec if(thread == 0 && start != end) PROF_float_STOP( _PIP, (double)(end-start)/(double)l->inner_vector_size, threading ); } -#endif double global_norm_MP( vector_float *x, int start, int end, level_struct *l, struct Thread *threading ) { diff --git a/src/linalg_generic.c b/src/linalg_generic.c index 3ef28c2..021ac44 100644 --- a/src/linalg_generic.c +++ b/src/linalg_generic.c @@ -21,11 +21,6 @@ #include "main.h" -#include "sse_float_intrinsic.h" -#include "sse_linalg.h" -#include "sse_linalg_PRECISION.h" - -#ifndef OPTIMIZED_LINALG_PRECISION complex_PRECISION global_inner_product_PRECISION( vector_PRECISION *phi, vector_PRECISION *psi, int start, int end, level_struct *l, struct Thread *threading ) { PROF_PRECISION_START( _GIP, threading ); @@ -71,7 +66,6 @@ complex_PRECISION global_inner_product_PRECISION( vector_PRECISION *phi, vector_ return local_alpha; } } -#endif complex_PRECISION process_inner_product_PRECISION( vector_PRECISION *phi, vector_PRECISION *psi, int start, int end, level_struct *l, struct Thread *threading ) { @@ -103,7 +97,6 @@ complex_PRECISION process_inner_product_PRECISION( vector_PRECISION *phi, vector } -#if !defined( OPTIMIZED_LINALG_PRECISION ) void process_multi_inner_product_PRECISION( int count, complex_PRECISION *results, vector_PRECISION *phi, vector_PRECISION *psi, int start, int end, level_struct *l, struct Thread *threading ) { @@ -183,8 +176,6 @@ void process_multi_inner_product_PRECISION_new( int count, complex_PRECISION *re PROF_PRECISION_STOP( _PIP, (double)(end-start)/(double)l->inner_vector_size, threading ); } -#endif - complex_PRECISION local_xy_over_xx_PRECISION( vector_PRECISION *phi, vector_PRECISION *psi, int start, int end, level_struct *l ) { @@ -199,7 +190,6 @@ complex_PRECISION local_xy_over_xx_PRECISION( vector_PRECISION *phi, vector_PREC return numerator/denominator; } -#ifndef OPTIMIZED_LINALG_PRECISION PRECISION global_norm_PRECISION( vector_PRECISION *x, int start, int end, level_struct *l, struct Thread *threading ) { PROF_PRECISION_START( _GIP, threading ); @@ -246,7 +236,6 @@ PRECISION global_norm_PRECISION( vector_PRECISION *x, int start, int end, level_ return (PRECISION)sqrt((double)local_alpha); } } -#endif PRECISION process_norm_PRECISION( vector_PRECISION *x, int start, int end, level_struct *l, struct Thread *threading ) { @@ -374,7 +363,6 @@ void vector_PRECISION_minus_new( vector_PRECISION *z, vector_PRECISION *x, vecto PROF_PRECISION_STOP( _LA2, (double)(end-start)/(double)l->inner_vector_size ); } -#ifndef 
OPTIMIZED_LINALG_PRECISION void vector_PRECISION_scale( vector_PRECISION *z, vector_PRECISION *x, complex_PRECISION alpha, int start, int end, level_struct *l ) { int thread = omp_get_thread_num(); @@ -406,7 +394,6 @@ void vector_PRECISION_scale_new( vector_PRECISION *z, vector_PRECISION *x, compl if(thread == 0 && start != end) PROF_PRECISION_STOP( _LA6, (double)(end-start)/(double)l->inner_vector_size ); } -#endif void buffer_PRECISION_real_scale( complex_PRECISION *z, complex_PRECISION *x, complex_PRECISION alpha, @@ -438,7 +425,6 @@ void buffer_PRECISION_copy( complex_PRECISION *z, complex_PRECISION *x, int star //PROF_PRECISION_STOP( _CPY, (double)(end-start)/(double)l->inner_vector_size ); } -#ifndef OPTIMIZED_LINALG_PRECISION void vector_PRECISION_saxpy( vector_PRECISION *z, vector_PRECISION *x, vector_PRECISION *y, complex_PRECISION alpha, int start, int end, level_struct *l ) { int thread = omp_get_thread_num(); @@ -480,9 +466,7 @@ void vector_PRECISION_saxpy_new( vector_PRECISION *z, vector_PRECISION *x, vecto if( thread == 0 && start != end ) PROF_PRECISION_STOP( _LA8, (double)(end-start)/(double)l->inner_vector_size ); } -#endif -#ifndef OPTIMIZED_LINALG_PRECISION void vector_PRECISION_multi_saxpy( vector_PRECISION *z, vector_PRECISION *V, complex_PRECISION *alpha, int sign, int count, int start, int end, level_struct *l ) { @@ -534,7 +518,6 @@ void vector_PRECISION_multi_saxpy_new( vector_PRECISION *z, vector_PRECISION *V, if( thread == 0 && start != end ) PROF_PRECISION_STOP( _LA8, (PRECISION)(count) ); } -#endif void vector_PRECISION_projection( vector_PRECISION *z, vector_PRECISION *v, int k, vector_PRECISION *W, complex_PRECISION *diag, int orthogonal, level_struct *l, Thread *threading ) { @@ -721,7 +704,6 @@ void gram_schmidt_PRECISION( vector_PRECISION *V, complex_PRECISION *buffer, con } -#if !defined( SSE ) || !defined( GRAM_SCHMIDT_VECTORIZED_PRECISION ) void setup_gram_schmidt_PRECISION_compute_dots( complex_PRECISION *thread_buffer, vector_PRECISION *V, int count, int offset, int start, int end, level_struct *l, struct Thread *threading) { @@ -764,10 +746,8 @@ void setup_gram_schmidt_PRECISION_compute_dots( END_MASTER(threading) // only master needs the result in this case (it will be distributed later) } -#endif -#if !defined( SSE ) || !defined( GRAM_SCHMIDT_VECTORIZED_PRECISION ) void setup_gram_schmidt_PRECISION_axpys( complex_PRECISION *thread_buffer, vector_PRECISION *V, int count, int offset, int start, int end, level_struct *l, struct Thread *threading) { @@ -791,7 +771,6 @@ void setup_gram_schmidt_PRECISION_axpys( } } } -#endif void setup_gram_schmidt_PRECISION( vector_PRECISION *V, vector_PRECISION *g5v, diff --git a/src/main.h b/src/main.h index 1d6969c..1629c5e 100644 --- a/src/main.h +++ b/src/main.h @@ -33,11 +33,9 @@ #define MAIN_HEADER #define num_loop 4 - - #define double_SIZE 64 - #define float_SIZE 32 - #define double_LENGTH SIMD_LENGTH/double_SIZE - #define float_LENGTH SIMD_LENGTH/float_SIZE + + // #define vector_loop(k, instructions) _Pragma("unroll") _Pragma("vector aligned") _Pragma("ivdep") for(k=0; k 0 ) { variable = (kind*) memalign( SIMD_LENGTH, sizeof(kind) * (length) ); } \ - if ( variable == NULL && (length) > 0 ) { \ - error0("malloc of \"%s\" failed: no memory allocated (%s:%d), current memory used: %lf GB.\n", \ - #variable, __FILE__, __LINE__, g.cur_storage/1024.0 ); } \ - g.cur_storage += (sizeof(kind) * (length))/(1024.0*1024.0); \ - if ( g.cur_storage > g.max_storage ) g.max_storage = g.cur_storage; }while(0) -#else #define 
MALLOC( variable, kind, length ) do{ if ( variable != NULL ) { \ printf0("malloc of \"%s\" failed: pointer is not NULL (%s:%d).\n", #variable, __FILE__, __LINE__ ); } \ if ( (length) > 0 ) { variable = (kind*) malloc( sizeof(kind) * (length) ); } \ @@ -109,7 +97,6 @@ #variable, __FILE__, __LINE__, g.cur_storage/1024.0 ); } \ g.cur_storage += (sizeof(kind) * (length))/(1024.0*1024.0); \ if ( g.cur_storage > g.max_storage ) g.max_storage = g.cur_storage; }while(0) -#endif #define FREE( variable, kind, length ) do{ if ( variable != NULL ) { \ free( variable ); variable = NULL; g.cur_storage -= (sizeof(kind) * (length))/(1024.0*1024.0); } else { \ @@ -496,24 +483,8 @@ // functions #include "clifford.h" -#ifdef SSE -#include "vectorization_dirac_float.h" -#include "vectorization_dirac_double.h" -#include "blas_vectorized.h" -#include "sse_blas_vectorized.h" -#include "sse_complex_float_intrinsic.h" -#include "sse_complex_double_intrinsic.h" -#include "sse_coarse_operator_float.h" -#include "sse_coarse_operator_double.h" -#include "sse_linalg_float.h" -#include "sse_linalg_double.h" -#include "sse_interpolation_float.h" -#include "sse_interpolation_double.h" -#else -//no intrinsics #include "interpolation_float.h" #include "interpolation_double.h" -#endif #include "data_float.h" #include "data_double.h" diff --git a/src/main_pre_def_generic.h b/src/main_pre_def_generic.h index 3e0399e..521e5e8 100644 --- a/src/main_pre_def_generic.h +++ b/src/main_pre_def_generic.h @@ -22,19 +22,9 @@ #ifndef MAIN_PRE_DEF_PRECISION_HEADER #define MAIN_PRE_DEF_PRECISION_HEADER - #ifdef AVX - #define SIMD_LENGTH 128 - #elif AVX2 - #define SIMD_LENGTH 256 - #elif AVX512 - #define SIMD_LENGTH 512 - #else - #define SIMD_LENGTH 128 - #endif - typedef PRECISION complex complex_PRECISION; typedef PRECISION complex *config_PRECISION; - typedef PRECISION complex *buffer_PRECISION;// __attribute__ ((aligned (SIMD_LENGTH))); + typedef PRECISION complex *buffer_PRECISION; typedef struct { buffer_PRECISION vector_buffer; @@ -74,10 +64,6 @@ vector_PRECISION *buffer; buffer_PRECISION prnT, prnZ, prnY, prnX, prpT, prpZ, prpY, prpX; comm_PRECISION_struct c; - OPERATOR_TYPE_PRECISION *D_vectorized; - OPERATOR_TYPE_PRECISION *D_transformed_vectorized; - OPERATOR_TYPE_PRECISION *clover_vectorized; - OPERATOR_TYPE_PRECISION *clover_oo_inv_vectorized; #ifdef HAVE_TM double mu, mu_odd_shift, mu_even_shift; config_PRECISION tm_term; @@ -85,8 +71,6 @@ #ifdef HAVE_TM1p1 double epsbar, epsbar_ig5_odd_shift, epsbar_ig5_even_shift; config_PRECISION epsbar_term, clover_doublet_oo_inv; - OPERATOR_TYPE_PRECISION *clover_doublet_vectorized; - OPERATOR_TYPE_PRECISION *clover_doublet_oo_inv_vectorized; #endif } operator_PRECISION_struct; diff --git a/src/oddeven_generic.c b/src/oddeven_generic.c index 12429fa..32d98f4 100644 --- a/src/oddeven_generic.c +++ b/src/oddeven_generic.c @@ -425,33 +425,11 @@ void diag_ee_PRECISION( vector_PRECISION *y, vector_PRECISION *x, operator_PRECI #ifdef HAVE_TM1p1 if( g.n_flavours == 2) { -#ifdef OPTIMIZED_SELF_COUPLING_PRECISION - PRECISION *sc_pt = op->clover_doublet_vectorized + (start/24)*288; - PRECISION *x_pt = (PRECISION*) (x->vector_buffer+start); - PRECISION *y_pt = (PRECISION*) (y->vector_buffer+start); - for ( int i=start; iepsbar_term+(start/24)*12; - if ( g.n_flavours == 2 && - ( op->epsbar != 0 || op->epsbar_ig5_odd_shift != 0 || op->epsbar_ig5_odd_shift != 0 ) ) - apply_doublet_coupling_PRECISION( x, y, epsbar_term, end-start ); -#else LU_multiply_PRECISION( y, x, op->clover_doublet_oo_inv, start, 
end); -#endif } else { #endif if ( g.csw ) { -#ifdef OPTIMIZED_SELF_COUPLING_PRECISION - PRECISION *sc_pt = op->clover_vectorized + (start/12)*144; - PRECISION *x_pt = (PRECISION*) (x->vector_buffer+start); - PRECISION *y_pt = (PRECISION*) (y->vector_buffer+start); - for ( int i=start; iclover, start, end); #else LLH_multiply_PRECISION( y, x, op->clover, start, end ); @@ -545,30 +523,12 @@ void diag_oo_inv_PRECISION( vector_PRECISION *y, vector_PRECISION *x, operator_P #ifdef HAVE_TM1p1 if( g.n_flavours == 2) { // inverse diagonal blocks applied to the odd sites -#ifdef OPTIMIZED_SELF_COUPLING_PRECISION - PRECISION *sc_pt = op->clover_doublet_oo_inv_vectorized + (start/24)*2*288; - PRECISION *x_pt = (PRECISION*) (x->vector_buffer+start); - PRECISION *y_pt = (PRECISION*) (y->vector_buffer+start); - for ( int i=start; iclover_doublet_oo_inv, start, end ); -#endif } else { #endif // inverse diagonal blocks applied to the odd sites if ( g.csw ) { -#ifdef OPTIMIZED_SELF_COUPLING_PRECISION - PRECISION *sc_pt = op->clover_vectorized + 2*2*(3*start); - PRECISION *x_pt = (PRECISION*) (x->vector_buffer+start); - PRECISION *y_pt = (PRECISION*) (y->vector_buffer+start); - for ( int i=start; iclover, start, end ); #else LLH_perform_fwd_bwd_subs_PRECISION( y, x, op->clover, start, end ); @@ -640,31 +600,12 @@ void oddeven_setup_PRECISION( operator_double_struct *in, level_struct *l ) { MALLOC( op->clover, complex_PRECISION, lu_dec_size*n ); Aee = op->clover; Aoo = op->clover + op->num_even_sites*lu_dec_size; - /* TODO: fix the vectorized part -#ifdef OPTIMIZED_SELF_COUPLING_PRECISION - MALLOC_HUGEPAGES( op->clover_vectorized, PRECISION, l->num_inner_lattice_sites*2*2*36, 4*SIMD_LENGTH_PRECISION ); - PRECISION *Aee_vectorized = op->clover_vectorized; - PRECISION *Aoo_vectorized = op->clover_vectorized + op->num_even_sites*2*2*36; -#endif - */ for ( t=0; tclover_doublet_oo_inv, complex_PRECISION, lu_doublet_dec_size*n ); Aee = op->clover_doublet_oo_inv; Aoo = op->clover_doublet_oo_inv + op->num_even_sites*lu_doublet_dec_size; - /* -#ifdef OPTIMIZED_SELF_COUPLING_PRECISION - MALLOC_HUGEPAGES( op->clover_doublet_vectorized, PRECISION, l->num_inner_lattice_sites*2*4*36, 4*SIMD_LENGTH_PRECISION ); - MALLOC_HUGEPAGES( op->clover_doublet_oo_inv_vectorized, PRECISION, op->num_odd_sites*2*2*144, 4*SIMD_LENGTH_PRECISION ); - PRECISION *Aee_vectorized = op->clover_doublet_vectorized; - PRECISION *Aoo_vectorized = op->clover_doublet_vectorized + op->num_even_sites*288; - PRECISION *Aoo_inverse_vectorized = op->clover_doublet_oo_inv_vectorized; -#endif - */ for ( t=0; tD_vectorized, PRECISION, 2*4*l->inner_vector_size, 4*SIMD_LENGTH_PRECISION ); - MALLOC_HUGEPAGES( op->D_transformed_vectorized, PRECISION, 2*4*l->inner_vector_size, 4*SIMD_LENGTH_PRECISION ); - for ( int i=0; inum_inner_lattice_sites; i++ ) { - PRECISION *D_vectorized = op->D_vectorized + 96*i; - PRECISION *D_transformed_vectorized = op->D_transformed_vectorized + 96*i; - complex_PRECISION *D_out_pt = op->D + 36*i; - for ( int mu=0; mu<4; mu++ ) { - set_PRECISION_D_vectorized( D_vectorized+24*mu, D_transformed_vectorized+24*mu, D_out_pt+9*mu ); - } - } -#endif - // define data layout MALLOC( op->index_table, int, N[T]*N[Z]*N[Y]*N[X] ); eot = op->index_table; @@ -930,18 +815,6 @@ void oddeven_free_PRECISION( level_struct *l ) { lu_dec_size = 72; #endif -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_float - FREE_HUGEPAGES( l->oe_op_PRECISION.D_vectorized, PRECISION, 2*4*l->inner_vector_size ); - FREE_HUGEPAGES( l->oe_op_PRECISION.D_transformed_vectorized, 
PRECISION, 2*4*l->inner_vector_size ); -#endif -#ifdef OPTIMIZED_SELF_COUPLING_PRECISION - FREE_HUGEPAGES( l->oe_op_PRECISION.clover_vectorized, PRECISION, l->num_inner_lattice_sites*2*2*36 ); -#ifdef HAVE_TM1p1 - FREE_HUGEPAGES( l->oe_op_PRECISION.clover_doublet_vectorized, PRECISION, l->num_inner_lattice_sites*2*4*36 ); - FREE_HUGEPAGES( l->oe_op_PRECISION.clover_doublet_oo_inv_vectorized, PRECISION, l->num_inner_lattice_sites*2*2*144 ); -#endif -#endif - ghost_free_PRECISION( &(l->oe_op_PRECISION.c), l ); FREE( l->oe_op_PRECISION.D, complex_PRECISION, 4*nc_size*n ); if ( g.csw ) @@ -1102,21 +975,13 @@ void hopping_term_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, opera plus_dir_param = _ODD_SITES; } -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - complex_PRECISION *prn[4] = { op->prnT, op->prnZ, op->prnY, op->prnX }; - complex_PRECISION *prp[4] = { op->prpT, op->prpZ, op->prpY, op->prpX }; -#else int i, *nb_pt; buffer_PRECISION phi_pt, eta_pt, end_pt; config_PRECISION D_pt; -#endif #ifdef HAVE_TM1p1 if( g.n_flavours == 2 ) { // project in negative directions -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - dprp_PRECISION( prn, phi->vector_buffer, 24*start, 24*n ); -#else complex_PRECISION pbuf[12]; for ( i=12*start, phi_pt=phi->vector_buffer+24*start; i<12*n; i+=12, phi_pt+=24 ) { dprp_T_PRECISION( op->prnT+i, phi_pt ); @@ -1124,7 +989,6 @@ void hopping_term_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, opera dprp_Y_PRECISION( op->prnY+i, phi_pt ); dprp_X_PRECISION( op->prnX+i, phi_pt ); } -#endif // start communication in negative direction START_LOCKED_MASTER(threading) ghost_sendrecv_PRECISION( op->prnT, T, -1, &(op->c), minus_dir_param, l ); @@ -1133,9 +997,6 @@ void hopping_term_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, opera ghost_sendrecv_PRECISION( op->prnX, X, -1, &(op->c), minus_dir_param, l ); END_LOCKED_MASTER(threading) // project plus dir and multiply with U dagger -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - dprn_su3_PRECISION( prp, phi->vector_buffer, op, neighbor, 24*start, 24*n ); -#else for ( phi_pt=phi->vector_buffer+24*start, end_pt=phi->vector_buffer+24*n, D_pt = op->D+36*start, nb_pt=neighbor+4*start; phi_ptprpX+i+6, D_pt, pbuf+6 ); mvmh_PRECISION( op->prpX+i+9, D_pt, pbuf+9 ); D_pt += 9; } -#endif if ( amount == _EVEN_SITES ) { start = start_even, n = end_even; } else if ( amount == _ODD_SITES ) { @@ -1185,9 +1045,6 @@ void hopping_term_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, opera ghost_wait_PRECISION( op->prnX, X, -1, &(op->c), minus_dir_param, l ); END_LOCKED_MASTER(threading) // multiply with U and lift up minus dir -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - su3_dpbp_PRECISION( eta->vector_buffer, prn, op, neighbor, 24*start, 24*n ); -#else for ( eta_pt=eta->vector_buffer+24*start, end_pt=eta->vector_buffer+24*n, D_pt = op->D+36*start, nb_pt=neighbor+4*start; eta_ptprnX+i+9 ); dpbp_su3_X_PRECISION( pbuf, eta_pt ); D_pt += 9; } -#endif // wait for communication in positive direction START_LOCKED_MASTER(threading) ghost_wait_PRECISION( op->prpT, T, +1, &(op->c), plus_dir_param, l ); @@ -1227,22 +1083,15 @@ void hopping_term_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, opera ghost_wait_PRECISION( op->prpX, X, +1, &(op->c), plus_dir_param, l ); END_LOCKED_MASTER(threading) // lift up plus dir -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - dpbn_PRECISION( eta->vector_buffer, prp, 24*start, 24*n ); -#else for ( i=12*start, eta_pt=eta->vector_buffer+24*start; i<12*n; i+=12, eta_pt+=24 ) { 
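      // one iteration per site in the doublet branch: i steps through the
      // half-spinor projection buffers (12 components per site), eta_pt through
      // the full 24-component spinors; each dpbn_su3_* call below lifts one
      // direction's received projection back onto the full spinor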
dpbn_su3_T_PRECISION( op->prpT+i, eta_pt ); dpbn_su3_Z_PRECISION( op->prpZ+i, eta_pt ); dpbn_su3_Y_PRECISION( op->prpY+i, eta_pt ); dpbn_su3_X_PRECISION( op->prpX+i, eta_pt ); } -#endif } else { #endif // project in negative directions -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - prp_PRECISION( prn, phi->vector_buffer, 12*start, 12*n ); -#else complex_PRECISION pbuf[6]; for ( i=6*start, phi_pt=phi->vector_buffer+12*start; i<6*n; i+=6, phi_pt+=12 ) { prp_T_PRECISION( op->prnT+i, phi_pt ); @@ -1250,7 +1099,6 @@ void hopping_term_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, opera prp_Y_PRECISION( op->prnY+i, phi_pt ); prp_X_PRECISION( op->prnX+i, phi_pt ); } -#endif // start communication in negative direction START_LOCKED_MASTER(threading) ghost_sendrecv_PRECISION( op->prnT, T, -1, &(op->c), minus_dir_param, l ); @@ -1259,9 +1107,6 @@ void hopping_term_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, opera ghost_sendrecv_PRECISION( op->prnX, X, -1, &(op->c), minus_dir_param, l ); END_LOCKED_MASTER(threading) // project plus dir and multiply with U dagger -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - prn_su3_PRECISION( prp, phi->vector_buffer, op, neighbor, 12*start, 12*n ); -#else for ( phi_pt=phi->vector_buffer+12*start, end_pt=phi->vector_buffer+12*n, D_pt = op->D+36*start, nb_pt=neighbor+4*start; phi_ptprpX+i, D_pt, pbuf ); mvmh_PRECISION( op->prpX+i+3, D_pt, pbuf+3 ); D_pt += 9; } -#endif if ( amount == _EVEN_SITES ) { start = start_even, n = end_even; } else if ( amount == _ODD_SITES ) { @@ -1303,9 +1147,6 @@ void hopping_term_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, opera ghost_wait_PRECISION( op->prnX, X, -1, &(op->c), minus_dir_param, l ); END_LOCKED_MASTER(threading) // multiply with U and lift up minus dir -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - su3_pbp_PRECISION( eta->vector_buffer, prn, op, neighbor, 12*start, 12*n ); -#else for ( eta_pt=eta->vector_buffer+12*start, end_pt=eta->vector_buffer+12*n, D_pt = op->D+36*start, nb_pt=neighbor+4*start; eta_ptprnX+i+3 ); pbp_su3_X_PRECISION( pbuf, eta_pt ); D_pt += 9; } -#endif // wait for communication in positive direction START_LOCKED_MASTER(threading) ghost_wait_PRECISION( op->prpT, T, +1, &(op->c), plus_dir_param, l ); @@ -1337,16 +1177,12 @@ void hopping_term_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, opera ghost_wait_PRECISION( op->prpX, X, +1, &(op->c), plus_dir_param, l ); END_LOCKED_MASTER(threading) // lift up plus dir -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - pbn_PRECISION( eta->vector_buffer, prp, 12*start, 12*n ); -#else for ( i=6*start, eta_pt=eta->vector_buffer+12*start; i<6*n; i+=6, eta_pt+=12 ) { pbn_su3_T_PRECISION( op->prpT+i, eta_pt ); pbn_su3_Z_PRECISION( op->prpZ+i, eta_pt ); pbn_su3_Y_PRECISION( op->prpY+i, eta_pt ); pbn_su3_X_PRECISION( op->prpX+i, eta_pt ); } -#endif #ifdef HAVE_TM1p1 } #endif @@ -1575,14 +1411,9 @@ void schwarz_PRECISION_oddeven_setup( schwarz_PRECISION_struct *s, level_struct } if ( g.csw ) { -#ifndef OPTIMIZED_SELF_COUPLING_PRECISION config_PRECISION clover_pt = op->clover, clover_oo_inv_pt = op->clover_oo_inv; complex_double buffer[42]; int cs = 42; -#else - PRECISION *clover_pt = op->clover_vectorized, *clover_oo_inv_pt = op->clover_oo_inv_vectorized; - int cs = 144; -#endif for ( d0=0; d0clover_doublet_oo_inv, clover_pt = op->clover; int cs = g.csw ? 42:12; -#else - PRECISION *clover_pt = g.csw ? 
op->clover_doublet_vectorized:(PRECISION*)op->clover, *clover_oo_inv_pt = op->clover_doublet_oo_inv_vectorized; - int cs = g.csw ? 288:24; -#endif config_PRECISION eps_term_pt = op->epsbar_term; #ifdef HAVE_TM tm_term_pt = op->tm_term; @@ -1667,8 +1486,6 @@ void schwarz_PRECISION_oddeven_setup( schwarz_PRECISION_struct *s, level_struct for ( x=a1*block_size[X]; x<(a1+1)*block_size[X]; x++ ) { if (((t-d1*block_size[T])+(z-c1*block_size[Z])+ (y-b1*block_size[Y])+(x-a1*block_size[X]))%2 == 1 ) { - -#ifndef OPTIMIZED_SELF_COUPLING_PRECISION if ( g.csw ) { for( i=0; i<12; i++ ) //0-23 buffer[i+12] = buffer[i] = (complex_double) clover_pt[i]; @@ -1694,41 +1511,6 @@ void schwarz_PRECISION_oddeven_setup( schwarz_PRECISION_struct *s, level_struct clover_pt += cs; selfcoupling_LU_doublet_decomposition_PRECISION( clover_oo_inv_pt, buffer ); clover_oo_inv_pt += 288; -#else - if ( g.csw ) { - sse_site_clover_doublet_invert_PRECISION( clover_pt, eps_term_pt, clover_oo_inv_pt ); - } else { -#ifdef HAVE_TM - for ( i=0; i<6; i++ ) { //we temporaly save in clover_oo_inv_pt - clover_oo_inv_pt[2*i] = clover_pt[2*i] + creal_PRECISION(tm_term_pt[i]); - clover_oo_inv_pt[2*i+1] = clover_pt[2*i+1] + cimag_PRECISION(tm_term_pt[i]); - clover_oo_inv_pt[2*i+12] = clover_pt[2*i] - creal_PRECISION(tm_term_pt[i]); - clover_oo_inv_pt[2*i+13] = clover_pt[2*i+1] - cimag_PRECISION(tm_term_pt[i]); - } - for ( i=6; i<12; i++ ) { - clover_oo_inv_pt[2*i+12] = clover_pt[2*i] + creal_PRECISION(tm_term_pt[i]); - clover_oo_inv_pt[2*i+13] = clover_pt[2*i+1] + cimag_PRECISION(tm_term_pt[i]); - clover_oo_inv_pt[2*i+24] = clover_pt[2*i] - creal_PRECISION(tm_term_pt[i]); - clover_oo_inv_pt[2*i+25] = clover_pt[2*i+1] - cimag_PRECISION(tm_term_pt[i]); - } - tm_term_pt += 12; -#else - for ( i=0; i<6; i++ ) { - clover_oo_inv_pt[2*i+12] = clover_oo_inv_pt[2*i] = clover_pt[2*i]; - clover_oo_inv_pt[2*i+13] = clover_oo_inv_pt[2*i+1] = clover_pt[2*i+1]; - } - for ( i=6; i<12; i++ ) { - clover_oo_inv_pt[2*i+24] = clover_oo_inv_pt[2*i+12] = clover_pt[2*i]; - clover_oo_inv_pt[2*i+25] = clover_oo_inv_pt[2*i+13] = clover_pt[2*i+1]; - } -#endif - sse_site_clover_doublet_invert_PRECISION( clover_oo_inv_pt, eps_term_pt, clover_oo_inv_pt ); - } - - clover_pt += cs; - eps_term_pt += 12; - clover_oo_inv_pt += 2*288; -#endif } } } @@ -1751,14 +1533,6 @@ void block_diag_oo_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, START_UNTHREADED_FUNCTION(threading) - -#ifdef OPTIMIZED_SELF_COUPLING_PRECISION - //we don't have the LU decomposition here, for debugging only - int n1 = s->num_block_even_sites, n2 = s->num_block_odd_sites, nv = l->num_lattice_site_var; - clover_PRECISION( eta, phi, &(s->op), start+nv*n1, start+nv*(n1+n2), l, threading ); - -#else - int i, n1 = s->num_block_even_sites, n2 = s->num_block_odd_sites; #ifdef HAVE_TM1p1 if ( g.n_flavours == 2 ) { @@ -1794,8 +1568,6 @@ void block_diag_oo_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, #ifdef HAVE_TM1p1 } #endif - -#endif END_UNTHREADED_FUNCTION(threading) } @@ -1811,42 +1583,19 @@ void block_diag_oo_inv_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, if ( g.n_flavours == 2 ) { int block_num = start/24/(n1+n2); -#ifndef OPTIMIZED_SELF_COUPLING_PRECISION config_PRECISION clover = s->op.clover_doublet_oo_inv-(block_num+1)*n1*288; LU_perform_fwd_bwd_subs_PRECISION( eta, phi, clover, n1*24+start, (n1+n2)*24+start ); -#else - PRECISION *clover_vectorized = s->op.clover_doublet_oo_inv_vectorized + (start/24-block_num*n1)*2*288; - vector_PRECISION lphi, leta; - 
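block_diag_oo_inv_PRECISION applies the precomputed LU factors of each odd-odd clover block through LU_perform_fwd_bwd_subs_PRECISION. A dense, unpivoted sketch of that forward/backward substitution (the code's packed per-site layout differs; row-major n x n storage with a unit lower triangle is an assumption):

  #include <complex.h>

  /* solve A y = x given the in-place LU factors of an n x n site block:
     unit lower triangle L below the diagonal, U on and above it */
  static void lu_fwd_bwd_subs_sketch( float complex *y, const float complex *x,
                                      const float complex *LU, int n ) {
    // forward substitution: L z = x (L has unit diagonal)
    for ( int i=0; i<n; i++ ) {
      y[i] = x[i];
      for ( int j=0; j<i; j++ )
        y[i] -= LU[i*n+j] * y[j];
    }
    // backward substitution: U y = z
    for ( int i=n-1; i>=0; i-- ) {
      for ( int j=i+1; j<n; j++ )
        y[i] -= LU[i*n+j] * y[j];
      y[i] /= LU[i*n+i];
    }
  }
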
lphi.vector_buffer = phi->vector_buffer+n1*24+start; - leta.vector_buffer = eta->vector_buffer+n1*24+start; - for ( i=0; iop.clover_oo_inv-(block_num+1)*n1*42; LLH_perform_fwd_bwd_subs_PRECISION( eta, phi, clover, n1*12+start, (n1+n2)*12+start ); #else config_PRECISION clover = s->op.clover_oo_inv-(block_num+1)*n1*72; LU_perform_fwd_bwd_subs_PRECISION( eta, phi, clover, n1*12+start, (n1+n2)*12+start ); -#endif -#else - PRECISION *clover_vectorized = s->op.clover_oo_inv_vectorized + (start/12-block_num*n1)*144; - vector_PRECISION lphi, leta; - lphi.vector_buffer = phi->vector_buffer+n1*12+start; - leta.vector_buffer = eta->vector_buffer+n1*12+start; - for ( i=0; iop.clover+n1*12+start; @@ -1878,26 +1627,6 @@ void block_hopping_term_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, int a1, a2, n1, n2, *length_even = s->dir_length_even, *length_odd = s->dir_length_odd, **index = s->oe_index, *neighbor = s->op.neighbor_table, nv = l->num_lattice_site_var; -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_float - PRECISION *Dplus = s->op.D_vectorized + (start/nv)*96; - PRECISION *Dminus = s->op.D_transformed_vectorized + (start/nv)*96; - - for ( int mu=0; mu<4; mu++ ) { - if ( amount == _EVEN_SITES ) { - a1 = 0; n1 = length_even[mu]; - a2 = n1; n2 = a2 + length_odd[mu]; - } else if ( amount == _ODD_SITES ) { - a1 = length_even[mu]; n1 = a1 + length_odd[mu]; - a2 = 0; n2 = a1; - } else { - a1 = 0; n1 = length_even[mu]+length_odd[mu]; - a2 = 0; n2 = n1; - } - block_oddeven_plus_coupling_PRECISION( (PRECISION*)(eta->vector_buffer+start), Dplus, (PRECISION*)(phi->vector_buffer+start), mu, a1, n1, index[mu], neighbor ); - block_oddeven_minus_coupling_PRECISION( (PRECISION*)(eta->vector_buffer+start), Dminus, (PRECISION*)(phi->vector_buffer+start), mu, a2, n2, index[mu], neighbor ); - } - -#else config_PRECISION D = s->op.D + (start/nv)*36; int i, j, k, *ind; config_PRECISION D_pt; @@ -2157,7 +1886,6 @@ void block_hopping_term_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, } #ifdef HAVE_TM1p1 } -#endif #endif END_UNTHREADED_FUNCTION(threading) } @@ -2171,26 +1899,6 @@ void block_n_hopping_term_PRECISION( vector_PRECISION *eta, vector_PRECISION *ph int a1, a2, n1, n2, *length_even = s->dir_length_even, *length_odd = s->dir_length_odd, **index = s->oe_index, *neighbor = s->op.neighbor_table, nv = l->num_lattice_site_var; -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_float - PRECISION *Dplus = s->op.D_vectorized + (start/nv)*96; - PRECISION *Dminus = s->op.D_transformed_vectorized + (start/nv)*96; - - for ( int mu=0; mu<4; mu++ ) { - if ( amount == _EVEN_SITES ) { - a1 = 0; n1 = length_even[mu]; - a2 = n1; n2 = a2 + length_odd[mu]; - } else if ( amount == _ODD_SITES ) { - a1 = length_even[mu]; n1 = a1 + length_odd[mu]; - a2 = 0; n2 = a1; - } else { - a1 = 0; n1 = length_even[mu]+length_odd[mu]; - a2 = 0; n2 = n1; - } - block_oddeven_nplus_coupling_PRECISION( (PRECISION*)(eta->vector_buffer+start), Dplus, (PRECISION*)(phi->vector_buffer+start), mu, a1, n1, index[mu], neighbor ); - block_oddeven_nminus_coupling_PRECISION( (PRECISION*)(eta->vector_buffer+start), Dminus, (PRECISION*)(phi->vector_buffer+start), mu, a2, n2, index[mu], neighbor ); - } - -#else int i, j, k, *ind; buffer_PRECISION lphi = phi->vector_buffer+start, leta = eta->vector_buffer+start; config_PRECISION D_pt, D = s->op.D + (start/nv)*36; @@ -2451,7 +2159,6 @@ void block_n_hopping_term_PRECISION( vector_PRECISION *eta, vector_PRECISION *ph #ifdef HAVE_TM1p1 } #endif -#endif END_UNTHREADED_FUNCTION(threading) } diff --git 
a/src/operator_generic.c b/src/operator_generic.c index da79ea4..bbda504 100644 --- a/src/operator_generic.c +++ b/src/operator_generic.c @@ -29,12 +29,8 @@ void operator_PRECISION_init( operator_PRECISION_struct *op ) { op->backward_neighbor_table = NULL; op->translation_table = NULL; op->D = NULL; - op->D_vectorized = NULL; - op->D_transformed_vectorized = NULL; op->clover = NULL; op->clover_oo_inv = NULL; - op->clover_vectorized = NULL; - op->clover_oo_inv_vectorized = NULL; op->m0 = 0; #ifdef HAVE_TM op->mu = 0; @@ -49,8 +45,6 @@ void operator_PRECISION_init( operator_PRECISION_struct *op ) { op->epsbar_ig5_odd_shift = 0; op->epsbar_term = NULL; op->clover_doublet_oo_inv = NULL; - op->clover_doublet_vectorized = NULL; - op->clover_doublet_oo_inv_vectorized = NULL; #endif for ( int mu=0; mu<4; mu++ ) @@ -144,8 +138,6 @@ void operator_PRECISION_alloc( operator_PRECISION_struct *op, const int type, le MALLOC( op->translation_table, int, l->num_inner_lattice_sites ); if ( type == _SCHWARZ && l->depth == 0 && g.odd_even ) { -#ifndef OPTIMIZED_SELF_COUPLING_PRECISION - if( g.csw ) { #ifdef HAVE_TM //we use LU here MALLOC( op->clover_oo_inv, complex_PRECISION, 72*(l->num_inner_lattice_sites/2+1) ); @@ -155,15 +147,6 @@ void operator_PRECISION_alloc( operator_PRECISION_struct *op, const int type, le } #ifdef HAVE_TM1p1 MALLOC( op->clover_doublet_oo_inv, complex_PRECISION, 12*12*2*(l->num_inner_lattice_sites/2+1) ); -#endif - -#else - if( g.csw ) - MALLOC_HUGEPAGES( op->clover_oo_inv_vectorized, PRECISION, 144*(l->num_inner_lattice_sites/2+1), 4*SIMD_LENGTH_PRECISION ); -#ifdef HAVE_TM1p1 - MALLOC_HUGEPAGES( op->clover_doublet_oo_inv_vectorized, PRECISION, 2*2*144*(l->num_inner_lattice_sites/2+1), 4*SIMD_LENGTH_PRECISION ); -#endif - #endif } @@ -224,8 +207,6 @@ void operator_PRECISION_free( operator_PRECISION_struct *op, const int type, lev FREE( op->tm_term, complex_PRECISION, block_site_size*l->num_inner_lattice_sites ); #endif if ( type == _SCHWARZ && l->depth == 0 && g.odd_even ) { -#ifndef OPTIMIZED_SELF_COUPLING_PRECISION - if( g.csw ) { #ifdef HAVE_TM //we use LU here FREE( op->clover_oo_inv, complex_PRECISION, 72*(l->num_inner_lattice_sites/2+1) ); @@ -235,15 +216,6 @@ void operator_PRECISION_free( operator_PRECISION_struct *op, const int type, lev } #ifdef HAVE_TM1p1 FREE( op->clover_doublet_oo_inv, complex_PRECISION, 12*12*2*(l->num_inner_lattice_sites/2+1) ); -#endif - -#else - if( g.csw ) - FREE_HUGEPAGES( op->clover_oo_inv_vectorized, PRECISION, 144*(l->num_inner_lattice_sites/2+1) ); -#ifdef HAVE_TM1p1 - FREE_HUGEPAGES( op->clover_doublet_oo_inv_vectorized, PRECISION, 2*2*144*(l->num_inner_lattice_sites/2+1) ); -#endif - #endif } @@ -338,45 +310,9 @@ void operator_PRECISION_set_couplings( operator_PRECISION_struct *op, level_stru } void operator_PRECISION_set_neighbor_couplings( operator_PRECISION_struct *op, level_struct *l ) { - -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - int i, n = 2*l->num_lattice_sites - l->num_inner_lattice_sites; - - for ( i=0; iD_vectorized + 96*i; - PRECISION *D_transformed_vectorized = op->D_transformed_vectorized + 96*i; - complex_PRECISION *D_pt = op->D + 36*i; - for ( int mu=0; mu<4; mu++ ) - set_PRECISION_D_vectorized( D_vectorized+24*mu, D_transformed_vectorized+24*mu, D_pt+9*mu ); - } -#endif - } void operator_PRECISION_set_self_couplings( operator_PRECISION_struct *op, level_struct *l ) { - -#ifdef OPTIMIZED_SELF_COUPLING_PRECISION - int i, n = l->num_inner_lattice_sites; - - if ( g.csw != 0 ) - for ( i=0; iclover_vectorized + 144*i; - 
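The 42 complex entries per site addressed just below (op->clover + 42*i) appear to pack the two 6x6 Hermitian chirality blocks of the clover term, 6 diagonal plus 15 off-diagonal values each. Where odd-even preconditioning needs their inverse, the plain-C path factorizes each block; the LLH_* routine names suggest a Cholesky-type L L^H factorization. A dense sketch under that assumption (row-major, no pivoting, Hermitian positive-definite input):

  #include <complex.h>
  #include <math.h>

  /* in-place Cholesky factorization A = L L^H of an n x n block;
     the lower triangle of A is overwritten with L */
  static void llh_decompose_sketch( float complex *A, int n ) {
    for ( int j=0; j<n; j++ ) {
      float d = crealf( A[j*n+j] );
      for ( int k=0; k<j; k++ )
        d -= crealf( A[j*n+k] * conjf( A[j*n+k] ) );   // subtract |L_jk|^2
      A[j*n+j] = sqrtf( d );                           // real, positive diagonal
      for ( int i=j+1; i<n; i++ ) {
        float complex s = A[i*n+j];
        for ( int k=0; k<j; k++ )
          s -= A[i*n+k] * conjf( A[j*n+k] );
        A[i*n+j] = s / crealf( A[j*n+j] );
      }
    }
  }
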
config_PRECISION clover_pt = op->clover + 42*i; - sse_set_clover_PRECISION( clover_vectorized_pt, clover_pt ); -#ifdef HAVE_TM1p1 - PRECISION *clover_doublet_vectorized_pt = op->clover_doublet_vectorized + 288*i; - sse_set_clover_doublet_PRECISION( clover_doublet_vectorized_pt, clover_pt ); -#endif -#ifdef HAVE_TM - config_PRECISION tm_term_pt = op->tm_term + 12*i; - sse_add_diagonal_clover_PRECISION( clover_vectorized_pt, tm_term_pt ); -#ifdef HAVE_TM1p1 - sse_add_diagonal_clover_doublet_PRECISION( clover_doublet_vectorized_pt, tm_term_pt ); -#endif -#endif - } -#endif - } void operator_PRECISION_test_routine( operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { diff --git a/src/schwarz_generic.c b/src/schwarz_generic.c index fa70c36..ee5dc4b 100644 --- a/src/schwarz_generic.c +++ b/src/schwarz_generic.c @@ -198,20 +198,6 @@ void schwarz_PRECISION_alloc( schwarz_PRECISION_struct *s, level_struct *l ) { MALLOC( s->local_minres_buffer[1], complex_PRECISION, svs ); MALLOC( s->local_minres_buffer[2], complex_PRECISION, svs ); -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_float - if ( l->depth == 0 ) { - MALLOC_HUGEPAGES( s->op.D_vectorized, PRECISION, 2*4*(2*l->vector_size-l->inner_vector_size), 4*SIMD_LENGTH_PRECISION ); - MALLOC_HUGEPAGES( s->op.D_transformed_vectorized, PRECISION, 2*4*(2*l->vector_size-l->inner_vector_size), 4*SIMD_LENGTH_PRECISION ); - } -#endif -#ifdef OPTIMIZED_SELF_COUPLING_PRECISION - if ( l->depth == 0 ) { - MALLOC_HUGEPAGES( s->op.clover_vectorized, PRECISION, 2*6*l->inner_vector_size, 4*SIMD_LENGTH_PRECISION ); -#ifdef HAVE_TM1p1 - MALLOC_HUGEPAGES( s->op.clover_doublet_vectorized, PRECISION, 4*2*6*l->inner_vector_size, 4*SIMD_LENGTH_PRECISION ); -#endif - } -#endif } @@ -290,20 +276,6 @@ void schwarz_PRECISION_free( schwarz_PRECISION_struct *s, level_struct *l ) { s->local_minres_buffer[1] = NULL; s->local_minres_buffer[2] = NULL; -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_float - if ( l->depth == 0 ) { - FREE_HUGEPAGES( s->op.D_vectorized, PRECISION, 2*4*(2*l->vector_size-l->inner_vector_size) ); - FREE_HUGEPAGES( s->op.D_transformed_vectorized, PRECISION, 2*4*(2*l->vector_size-l->inner_vector_size) ); - } -#endif -#ifdef OPTIMIZED_SELF_COUPLING_PRECISION - if ( l->depth == 0 ) { - FREE_HUGEPAGES( s->op.clover_vectorized, PRECISION, 2*6*l->inner_vector_size ); -#ifdef HAVE_TM1p1 - FREE_HUGEPAGES( s->op.clover_doublet_vectorized, PRECISION, 4*2*6*l->inner_vector_size ); -#endif - } -#endif } @@ -742,17 +714,6 @@ void block_PRECISION_boundary_op( vector_PRECISION *eta, vector_PRECISION *phi, schwarz_PRECISION_struct *s, level_struct *l ) { // k: number of current block int *bbl = s->block_boundary_length; -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_float - PRECISION *Dplus = s->op.D_vectorized; - PRECISION *Dminus = s->op.D_transformed_vectorized; - - for ( int mu=0; mu<4; mu++ ) { - boundary_plus_coupling_PRECISION( (PRECISION*)eta->vector_buffer, Dplus, (PRECISION*)phi->vector_buffer, - mu, bbl[2*mu], bbl[2*mu+1], s->block[k].bt, NULL ); - boundary_minus_coupling_PRECISION( (PRECISION*)eta->vector_buffer, Dminus, (PRECISION*)phi->vector_buffer, - mu, bbl[2*mu+1], bbl[2*mu+2], s->block[k].bt, NULL ); - } -#else int i, mu, index, neighbor_index; config_PRECISION D_pt, D = s->op.D; buffer_PRECISION phi_pt, eta_pt; @@ -988,7 +949,6 @@ void block_PRECISION_boundary_op( vector_PRECISION *eta, vector_PRECISION *phi, #ifdef HAVE_TM1p1 } #endif -#endif } @@ -996,17 +956,6 @@ void n_block_PRECISION_boundary_op( vector_PRECISION *eta, vector_PRECISION *phi 
schwarz_PRECISION_struct *s, level_struct *l ) { // k: number of current block int *bbl = s->block_boundary_length; -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_float - PRECISION *Dplus = s->op.D_vectorized; - PRECISION *Dminus = s->op.D_transformed_vectorized; - - for ( int mu=0; mu<4; mu++ ) { - boundary_nplus_coupling_PRECISION( (PRECISION*)eta->vector_buffer, Dplus, (PRECISION*)phi->vector_buffer, - mu, bbl[2*mu], bbl[2*mu+1], s->block[k].bt, NULL ); - boundary_nminus_coupling_PRECISION( (PRECISION*)eta->vector_buffer, Dminus, (PRECISION*)phi->vector_buffer, - mu, bbl[2*mu+1], bbl[2*mu+2], s->block[k].bt, NULL ); - } -#else int i, mu, index, neighbor_index; config_PRECISION D_pt, D = s->op.D; buffer_PRECISION phi_pt, eta_pt; @@ -1241,7 +1190,6 @@ void n_block_PRECISION_boundary_op( vector_PRECISION *eta, vector_PRECISION *phi } #ifdef HAVE_TM1p1 } -#endif #endif } @@ -1251,33 +1199,6 @@ void coarse_block_PRECISION_boundary_op( vector_PRECISION *eta, vector_PRECISION // k: number of current block int *bbl = s->block_boundary_length, n = l->num_lattice_site_var; -#ifdef OPTIMIZED_COARSE_NEIGHBOR_COUPLING_PRECISION - int column_offset = 2*SIMD_LENGTH_PRECISION*((l->num_parent_eig_vect+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - int vectorized_link_offset = 4*l->num_parent_eig_vect*column_offset; - - for ( int mu=0; mu<4; mu++ ) { - OPERATOR_TYPE_PRECISION *Dplus = s->op.D_vectorized + mu*vectorized_link_offset; - OPERATOR_TYPE_PRECISION *Dminus = s->op.D_transformed_vectorized + mu*vectorized_link_offset; - // plus mu direction - for ( int i=bbl[2*mu]; iblock[k].bt[i]; - int neighbor_index = s->block[k].bt[i+1]; - vector_PRECISION phi_pt, eta_pt; - phi_pt.vector_buffer = phi->vector_buffer + n*neighbor_index; - eta_pt.vector_buffer = eta->vector_buffer + n*index; - coarse_hopp_PRECISION_vectorized( &eta_pt, &phi_pt, Dplus + 4*vectorized_link_offset*index, l ); - } - // minus mu direction - for ( int i=bbl[2*mu+1]; iblock[k].bt[i]; - int neighbor_index = s->block[k].bt[i+1]; - vector_PRECISION phi_pt, eta_pt; - phi_pt.vector_buffer = phi->vector_buffer + n*neighbor_index; - eta_pt.vector_buffer = eta->vector_buffer + n*index; - coarse_hopp_PRECISION_vectorized( &eta_pt, &phi_pt, Dminus + 4*vectorized_link_offset*neighbor_index, l ); - } - } -#else config_PRECISION D = s->op.D; int link_size = SQUARE(2*l->num_parent_eig_vect), site_size=4*link_size; @@ -1303,7 +1224,6 @@ void coarse_block_PRECISION_boundary_op( vector_PRECISION *eta, vector_PRECISION coarse_daggered_hopp_PRECISION( &eta_pt, &phi_pt, D_pt, l ); } } -#endif } @@ -1311,33 +1231,6 @@ void n_coarse_block_PRECISION_boundary_op( vector_PRECISION *eta, vector_PRECISI int k, schwarz_PRECISION_struct *s, level_struct *l ) { // k: number of current block int *bbl = s->block_boundary_length, n = l->num_lattice_site_var; -#ifdef OPTIMIZED_COARSE_NEIGHBOR_COUPLING_PRECISION - int column_offset = 2*SIMD_LENGTH_PRECISION*((l->num_parent_eig_vect+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - int vectorized_link_offset = 4*l->num_parent_eig_vect*column_offset; - - for ( int mu=0; mu<4; mu++ ) { - OPERATOR_TYPE_PRECISION *Dplus = s->op.D_vectorized + mu*vectorized_link_offset; - OPERATOR_TYPE_PRECISION *Dminus = s->op.D_transformed_vectorized + mu*vectorized_link_offset; - // plus mu direction - for ( int i=bbl[2*mu]; iblock[k].bt[i]; - int neighbor_index = s->block[k].bt[i+1]; - vector_PRECISION phi_pt, eta_pt; - phi_pt.vector_buffer = phi->vector_buffer + n*neighbor_index; - eta_pt.vector_buffer = eta->vector_buffer + n*index; - 
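/* For reference: with the vectorized path removed, every block-boundary
 * coupling in these hunks reduces to one dense matrix-vector product per
 * (site, neighbor) pair stored in s->block[k].bt, with n = 2*l->num_parent_eig_vect
 * components per site. A minimal scalar sketch of the negative forward hop,
 * assuming row-major link storage and the eta -= D*phi contract suggested by
 * the coarse_n_hopp_PRECISION name (illustrative only, not the library code): */
static inline void coarse_n_hopp_sketch( complex_float *eta, const complex_float *phi,
                                         const complex_float *D, int n ) {
  for ( int row=0; row<n; row++ )      // each output component of the site
    for ( int col=0; col<n; col++ )    // dense n x n link matrix, row major
      eta[row] -= D[row*n+col]*phi[col];
}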
coarse_n_hopp_PRECISION_vectorized( &eta_pt, &phi_pt, Dplus + 4*vectorized_link_offset*index, l ); - } - // minus mu direction - for ( int i=bbl[2*mu+1]; iblock[k].bt[i]; - int neighbor_index = s->block[k].bt[i+1]; - vector_PRECISION phi_pt, eta_pt; - phi_pt.vector_buffer = phi->vector_buffer + n*neighbor_index; - eta_pt.vector_buffer = eta->vector_buffer + n*index; - coarse_n_hopp_PRECISION_vectorized( &eta_pt, &phi_pt, Dminus + 4*vectorized_link_offset*neighbor_index, l ); - } - } -#else int link_size = SQUARE(2*l->num_parent_eig_vect), site_size=4*link_size; config_PRECISION D = s->op.D; @@ -1363,7 +1256,6 @@ void n_coarse_block_PRECISION_boundary_op( vector_PRECISION *eta, vector_PRECISI coarse_n_daggered_hopp_PRECISION( &eta_pt, &phi_pt, D_pt, l ); } } -#endif } diff --git a/src/schwarz_generic.h b/src/schwarz_generic.h index 26333ad..2bc22d7 100644 --- a/src/schwarz_generic.h +++ b/src/schwarz_generic.h @@ -76,22 +76,4 @@ struct Thread; } } -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_float -static inline void set_PRECISION_D_vectorized( PRECISION *out1, PRECISION *out2, complex_PRECISION *in ) { - // out1: column major, out2: row major - for ( int i=0; i<3; i++ ) { // column - for ( int j=0; j<3; j++ ) { // row - out1[8*i +j] = creal_PRECISION(in[3*j+i]); - out1[8*i+4+j] = cimag_PRECISION(in[3*j+i]); - out2[8*i +j] = creal_PRECISION(in[j+3*i]); - out2[8*i+4+j] = cimag_PRECISION(in[j+3*i]); - } - out1[8*i+3] = 0.0; - out1[8*i+7] = 0.0; - out2[8*i+3] = 0.0; - out2[8*i+7] = 0.0; - } -} -#endif - #endif diff --git a/src/setup_generic.c b/src/setup_generic.c index 5570a60..e91c3c9 100644 --- a/src/setup_generic.c +++ b/src/setup_generic.c @@ -32,13 +32,8 @@ void coarse_grid_correction_PRECISION_setup( level_struct *l, struct Thread *thr START_LOCKED_MASTER(threading) coarse_operator_PRECISION_alloc( l ); -#ifndef INTERPOLATION_SETUP_LAYOUT_OPTIMIZED_PRECISION coarse_operator_PRECISION_setup( l->is_PRECISION.interpolation, l ); END_LOCKED_MASTER(threading) -#else - END_LOCKED_MASTER(threading) - coarse_operator_PRECISION_setup_vectorized( l->is_PRECISION.operator, l, threading ); -#endif START_LOCKED_MASTER(threading) if ( !l->next_level->idle ) { @@ -266,23 +261,16 @@ void interpolation_PRECISION_define( vector_double *V, level_struct *l, struct T } } -#ifndef INTERPOLATION_SETUP_LAYOUT_OPTIMIZED_PRECISION for ( k=0; kis_PRECISION.interpolation[k]), &(l->is_PRECISION.test_vector[k]), start, end, l ); } -#endif testvector_analysis_PRECISION( l->is_PRECISION.test_vector, l, threading ); -#ifdef INTERPOLATION_SETUP_LAYOUT_OPTIMIZED_PRECISION - define_interpolation_PRECISION_operator( l->is_PRECISION.test_vector, l, threading ); - gram_schmidt_on_aggregates_PRECISION_vectorized( l->is_PRECISION.operator, n, l, threading ); -#else gram_schmidt_on_aggregates_PRECISION( l->is_PRECISION.interpolation, n, l, threading ); define_interpolation_PRECISION_operator( l->is_PRECISION.interpolation, l, threading ); -#endif } @@ -291,14 +279,6 @@ void re_setup_PRECISION( level_struct *l, struct Thread *threading ) { if ( l->level > 0 ) { if ( !l->idle ) { -#ifdef INTERPOLATION_SETUP_LAYOUT_OPTIMIZED_PRECISION - define_interpolation_PRECISION_operator( l->is_PRECISION.test_vector, l, threading ); - gram_schmidt_on_aggregates_PRECISION_vectorized( l->is_PRECISION.operator, l->num_eig_vect, l, threading ); - if ( l->depth > 0 ) - gram_schmidt_on_aggregates_PRECISION_vectorized( l->is_PRECISION.operator, l->num_eig_vect, l, threading ); - coarse_operator_PRECISION_setup_vectorized( l->is_PRECISION.operator, l, 
threading ); - START_LOCKED_MASTER(threading) -#else for ( int i=0; i<l->num_eig_vect; i++ ) { vector_PRECISION_copy( &(l->is_PRECISION.interpolation[i]), &(l->is_PRECISION.test_vector[i]), threading->start_index[l->depth], threading->end_index[l->depth], l ); @@ -309,7 +289,7 @@ void re_setup_PRECISION( level_struct *l, struct Thread *threading ) { define_interpolation_PRECISION_operator( l->is_PRECISION.interpolation, l, threading ); START_LOCKED_MASTER(threading) coarse_operator_PRECISION_setup( l->is_PRECISION.interpolation, l ); -#endif + conf_PRECISION_gather( &(l->next_level->s_PRECISION.op), &(l->next_level->op_PRECISION), l->next_level ); END_LOCKED_MASTER(threading) if ( !l->next_level->idle && l->next_level->level > 0 ) { @@ -390,14 +370,6 @@ void inv_iter_2lvl_extension_setup_PRECISION( int setup_iter, level_struct *l, s END_MASTER(threading) #endif -#ifdef INTERPOLATION_SETUP_LAYOUT_OPTIMIZED_PRECISION - define_interpolation_PRECISION_operator( l->is_PRECISION.test_vector, l, threading ); - gram_schmidt_on_aggregates_PRECISION_vectorized( l->is_PRECISION.operator, l->num_eig_vect, l, threading ); - if ( l->depth > 0 ) - gram_schmidt_on_aggregates_PRECISION_vectorized( l->is_PRECISION.operator, l->num_eig_vect, l, threading ); - coarse_operator_PRECISION_setup_vectorized( l->is_PRECISION.operator, l, threading ); - START_LOCKED_MASTER(threading) -#else for ( int i=0; i<l->num_eig_vect; i++ ) vector_PRECISION_copy( &(l->is_PRECISION.interpolation[i]), &(l->is_PRECISION.test_vector[i]), threading->start_index[l->depth], threading->end_index[l->depth], l ); @@ -407,7 +379,7 @@ void inv_iter_2lvl_extension_setup_PRECISION( int setup_iter, level_struct *l, s define_interpolation_PRECISION_operator( l->is_PRECISION.interpolation, l, threading ); START_LOCKED_MASTER(threading) coarse_operator_PRECISION_setup( l->is_PRECISION.interpolation, l ); -#endif + conf_PRECISION_gather( &(l->next_level->s_PRECISION.op), &(l->next_level->op_PRECISION), l->next_level ); END_LOCKED_MASTER(threading) if ( !l->next_level->idle && l->next_level->level > 0 ) { diff --git a/src/threading.c b/src/threading.c index d793c3a..aa731f9 100644 --- a/src/threading.c +++ b/src/threading.c @@ -116,12 +116,8 @@ void setup_no_threading(struct Thread *no_threading, struct level_struct *l) void compute_core_start_end(int start, int end, int *core_start, int *core_end, struct level_struct *l, struct Thread *threading) { -#ifdef SSE - int min_per_core = 2*l->num_lattice_site_var; -#else // due to loop unrolling in low level functions int min_per_core = 3*40; -#endif // printf0("min_per_core = %d\n", min_per_core ); compute_core_start_end_custom(start, end, core_start, core_end, l, threading, min_per_core); } diff --git a/src/vector_generic.c b/src/vector_generic.c index 2e8a01e..19a8434 100644 --- a/src/vector_generic.c +++ b/src/vector_generic.c @@ -71,16 +71,9 @@ void vector_PRECISION_define( vector_PRECISION *phi, complex_PRECISION value, in if(thread == 0 && start != end) PROF_PRECISION_START( _SET ); if ( phi->vector_buffer != NULL ) { - //int i; - //for ( i=start; i<end; i++ ) phi->vector_buffer[i] = value; - for(int i=start; i<end; i+=num_loop) - #pragma unroll - #pragma vector aligned - #pragma ivdep - for(int j=0; j<num_loop; j++) - phi->vector_buffer[i+j] = value; + int i; + for ( i=start; i<end; i++ ) + phi->vector_buffer[i] = value; } else { error0("Error in \"vector_PRECISION_define\": pointer is null\n"); } @@ -119,39 +112,10 @@ void vector_PRECISION_real_scale( vector_PRECISION *z, vector_PRECISION *x, comp if(thread == 0 && start != end) PROF_PRECISION_START( _RS ); - if(z == x){ -#ifdef OPTIMIZE - PRECISION * restrict r_z = (PRECISION*)z->vector_buffer, 
r_alpha = creal_PRECISION(alpha); -#else - PRECISION *r_z = (PRECISION*)z->vector_buffer, r_alpha = creal_PRECISION(alpha); -#endif - int r_start = 2*start, r_end = 2*end; + PRECISION *r_z = (PRECISION*)z->vector_buffer, *r_x = (PRECISION*)x->vector_buffer, r_alpha = creal_PRECISION(alpha); + int r_start = 2*start, r_end = 2*end; - //REAL_VECTOR_FOR( int i=r_start, ivector_buffer, * restrict r_x = (PRECISION*)x->vector_buffer, r_alpha = creal_PRECISION(alpha); -#else - PRECISION *r_z = (PRECISION*)z->vector_buffer, *r_x = (PRECISION*)x->vector_buffer, r_alpha = creal_PRECISION(alpha); -#endif - int r_start = 2*start, r_end = 2*end; - - //REAL_VECTOR_FOR( int i=r_start, iinner_vector_size ); @@ -204,12 +168,12 @@ void vector_PRECISION_real_scale_new( vector_PRECISION *z, vector_PRECISION *x, } else { // PRECISION * restrict r_z = (PRECISION*)z->vector_buffer, * restrict r_x = (PRECISION*)x->vector_buffer; for( i=start; inum_vect; j+=num_loop) - #pragma unroll + //for( j=0; jnum_vect; j+=num_loop) + /* #pragma unroll #pragma vector aligned #pragma ivdep - for( k=0; kvector_buffer[i*x->num_vect+j+k] = r_alpha[j+k]*x->vector_buffer[i*x->num_vect+j+k]; + for( k=0; kvector_buffer[i*x->num_vect+j] = r_alpha[j]*x->vector_buffer[i*x->num_vect+j];) } //vector_PRECISION_change_layout( x, x, _NV_LV_SV, no_threading ); //vector_PRECISION_change_layout( z, z, _NV_LV_SV, no_threading ); @@ -224,24 +188,11 @@ void vector_PRECISION_copy( vector_PRECISION *z, vector_PRECISION *x, int start, if(z == x) return; - //vector_PRECISION_check_comp( z, x ); - //z->layout = x->layout; -#ifdef OPTIMIZE - buffer_PRECISION restrict z_pt=z->vector_buffer, restrict x_pt=x->vector_buffer; -#else buffer_PRECISION z_pt=z->vector_buffer, x_pt=x->vector_buffer; -#endif int thread = omp_get_thread_num(); if(thread == 0 && start != end) PROF_PRECISION_START( _CPY ); - //VECTOR_FOR( int i=start, iinner_vector_size ); From 7371a329432632a4a15867a3e9d4b890240d1116 Mon Sep 17 00:00:00 2001 From: Marc Illa Date: Mon, 10 Dec 2018 11:35:46 +0200 Subject: [PATCH 29/31] delete sse 2 --- src/linalg_generic.c | 73 ++++++++++++++++++++++++++------------------ src/main.h | 4 +-- src/vector_generic.c | 2 +- 3 files changed, 47 insertions(+), 32 deletions(-) diff --git a/src/linalg_generic.c b/src/linalg_generic.c index 021ac44..d0ecaf1 100644 --- a/src/linalg_generic.c +++ b/src/linalg_generic.c @@ -158,19 +158,21 @@ void process_multi_inner_product_PRECISION_new( int count, complex_PRECISION *re PROF_PRECISION_START( _PIP, threading ); int i, j, k; - for(int c=0; cnum_vect; c+=num_loop) + /*for(int c=0; cnum_vect; c+=num_loop) #pragma unroll #pragma vector aligned for( k=0; knum_vect, results[j] = 0.0;) for(int c=0; cnum_vect; j+=num_loop) + /*for( j=0; jnum_vect; j+=num_loop) #pragma unroll #pragma vector aligned for( k=0; knum_vect+j+k] += conj_PRECISION(phi[c].vector_buffer[i*psi->num_vect+j+k])*psi->vector_buffer[i*psi->num_vect+j+k]; + results[c*psi->num_vect+j+k] += conj_PRECISION(phi[c].vector_buffer[i*psi->num_vect+j+k])*psi->vector_buffer[i*psi->num_vect+j+k];*/ + vector_loop(j, psi->num_vect, results[c*psi->num_vect+j] += conj_PRECISION(phi[c].vector_buffer[i*psi->num_vect+j])*psi->vector_buffer[i*psi->num_vect+j];) if(thread == 0 && start != end) PROF_PRECISION_STOP( _PIP, (double)(end-start)/(double)l->inner_vector_size, threading ); @@ -274,22 +276,25 @@ void global_norm_PRECISION_new( PRECISION *res, vector_PRECISION *x, level_struc PROF_PRECISION_START( _GIP, threading ); int i, j, k; - for( j=0; jnum_vect; j+=num_loop) + 
/*for( j=0; jnum_vect; j+=num_loop) #pragma unroll #pragma vector aligned for( k=0; knum_vect, res[j]=0;) for( i=start; inum_vect; j+=num_loop) + /*for( j=0; jnum_vect; j+=num_loop) #pragma unroll for( k=0; kvector_buffer[i*x->num_vect+j+k]); + res[j+k] += NORM_SQUARE_PRECISION(x->vector_buffer[i*x->num_vect+j+k]);*/ + vector_loop(j, x->num_vect, res[j] += NORM_SQUARE_PRECISION(x->vector_buffer[i*x->num_vect+j]);) - for( j=0; jnum_vect; j+=num_loop) + /*for( j=0; jnum_vect; j+=num_loop) #pragma unroll for( k=0; knum_vect, res[j] = (PRECISION)sqrt((double)res[j]);) if(thread == 0 && start != end) PROF_PRECISION_STOP( _GIP, (double)(end-start)/(double)l->inner_vector_size, threading ); @@ -318,12 +323,13 @@ void vector_PRECISION_plus_new( vector_PRECISION *z, vector_PRECISION *x, vector PROF_PRECISION_START( _LA2 ); for( i=start; inum_vect; j+=num_loop) + /*for( j=0; jnum_vect; j+=num_loop) #pragma unroll #pragma vector aligned #pragma ivdep for( k=0; kvector_buffer[i*x->num_vect+j+k] = x->vector_buffer[i*x->num_vect+j+k] + y->vector_buffer[i*x->num_vect+j+k]; + z->vector_buffer[i*x->num_vect+j+k] = x->vector_buffer[i*x->num_vect+j+k] + y->vector_buffer[i*x->num_vect+j+k];*/ + vector_loop(j, x->num_vect, z->vector_buffer[i*x->num_vect+j] = x->vector_buffer[i*x->num_vect+j] + y->vector_buffer[i*x->num_vect+j];) if(thread == 0 && start != end) PROF_PRECISION_STOP( _LA2, (double)(end-start)/(double)l->inner_vector_size ); @@ -345,19 +351,20 @@ void vector_PRECISION_minus( vector_PRECISION *z, vector_PRECISION *x, vector_PR void vector_PRECISION_minus_new( vector_PRECISION *z, vector_PRECISION *x, vector_PRECISION *y, level_struct *l, struct Thread *threading ) { - int i, j, k, start, end; + int i, j, jj, start, end; compute_core_start_end(0, y->size, &start, &end, l, threading); int thread = omp_get_thread_num(); if(thread == 0 && start != end) PROF_PRECISION_START( _LA2 ); for( i=start; inum_vect; j+=num_loop) + /*for( j=0; jnum_vect; j+=num_loop) #pragma unroll #pragma vector aligned #pragma ivdep for( k=0; kvector_buffer[i*x->num_vect+j+k] = x->vector_buffer[i*x->num_vect+j+k] - y->vector_buffer[i*x->num_vect+j+k]; + z->vector_buffer[i*x->num_vect+j+k] = x->vector_buffer[i*x->num_vect+j+k] - y->vector_buffer[i*x->num_vect+j+k];*/ + vector_loop2(j, x->num_vect, jj, z->vector_buffer[i*x->num_vect+j+jj] = x->vector_buffer[i*x->num_vect+j+jj] - y->vector_buffer[i*x->num_vect+j+jj];) if(thread == 0 && start != end) PROF_PRECISION_STOP( _LA2, (double)(end-start)/(double)l->inner_vector_size ); @@ -384,12 +391,13 @@ void vector_PRECISION_scale_new( vector_PRECISION *z, vector_PRECISION *x, compl PROF_PRECISION_START( _LA6 ); for( i=start; inum_vect; j+=num_loop) + /*for( j=0; jnum_vect; j+=num_loop) #pragma unroll #pragma vector aligned #pragma ivdep for( n=0; nvector_buffer[i*x->num_vect+j+n] = alpha[k*x->num_vect+j+n]*x->vector_buffer[i*x->num_vect+j+n]; + z->vector_buffer[i*x->num_vect+j+n] = alpha[k*x->num_vect+j+n]*x->vector_buffer[i*x->num_vect+j+n];*/ + vector_loop(j, x->num_vect, z->vector_buffer[i*x->num_vect+j] = alpha[k*x->num_vect+j]*x->vector_buffer[i*x->num_vect+j];) if(thread == 0 && start != end) PROF_PRECISION_STOP( _LA6, (double)(end-start)/(double)l->inner_vector_size ); @@ -416,13 +424,13 @@ void buffer_PRECISION_real_scale( complex_PRECISION *z, complex_PRECISION *x, co void buffer_PRECISION_copy( complex_PRECISION *z, complex_PRECISION *x, int start, int end, level_struct *l ) { int thread = omp_get_thread_num(); - //if(thread == 0 && start != end) - //PROF_PRECISION_START( 
_CPY ); + if(thread == 0 && start != end) + PROF_PRECISION_START( _CPY ); VECTOR_FOR( int i=start, i<end, i++, z[i] = x[i];, z, x ); - //if(thread == 0 && start != end) - //PROF_PRECISION_STOP( _CPY, (double)(end-start)/(double)l->inner_vector_size ); + if(thread == 0 && start != end) + PROF_PRECISION_STOP( _CPY, (double)(end-start)/(double)l->inner_vector_size ); } void vector_PRECISION_saxpy( vector_PRECISION *z, vector_PRECISION *x, vector_PRECISION *y, complex_PRECISION alpha, int start, int end, level_struct *l ) { @@ -437,6 +445,9 @@ void vector_PRECISION_saxpy( vector_PRECISION *z, vector_PRECISION *x, vector_PR PROF_PRECISION_STOP( _LA8, (double)(end-start)/(double)l->inner_vector_size ); } +// New input variable: sign +// sign == 1 : z = x + alpha*y +// otherwise : z = x - alpha*y void vector_PRECISION_saxpy_new( vector_PRECISION *z, vector_PRECISION *x, vector_PRECISION *y, complex_PRECISION *alpha, int k, int sign, level_struct *l, struct Thread *threading ) { int i, j, n, start, end; @@ -447,20 +458,22 @@ void vector_PRECISION_saxpy_new( vector_PRECISION *z, vector_PRECISION *x, vecto if( sign == 1 ) for( i=start; i<end; i++ ) /*for( j=0; j<x->num_vect; j+=num_loop) #pragma unroll #pragma vector aligned #pragma ivdep for( n=0; n<num_loop; n++ ) - z->vector_buffer[i*x->num_vect+j+n] = x->vector_buffer[i*x->num_vect+j+n] + alpha[k*x->num_vect+j+n]*y->vector_buffer[i*x->num_vect+j+n]; + z->vector_buffer[i*x->num_vect+j+n] = x->vector_buffer[i*x->num_vect+j+n] + alpha[k*x->num_vect+j+n]*y->vector_buffer[i*x->num_vect+j+n];*/ + vector_loop(j, x->num_vect, z->vector_buffer[i*x->num_vect+j] = x->vector_buffer[i*x->num_vect+j] + alpha[k*x->num_vect+j]*y->vector_buffer[i*x->num_vect+j];) else for( i=start; i<end; i++ ) /*for( j=0; j<x->num_vect; j+=num_loop) #pragma unroll #pragma vector aligned #pragma ivdep for( n=0; n<num_loop; n++ ) - z->vector_buffer[i*x->num_vect+j+n] = x->vector_buffer[i*x->num_vect+j+n] - alpha[k*x->num_vect+j+n]*y->vector_buffer[i*x->num_vect+j+n]; + z->vector_buffer[i*x->num_vect+j+n] = x->vector_buffer[i*x->num_vect+j+n] - alpha[k*x->num_vect+j+n]*y->vector_buffer[i*x->num_vect+j+n];*/ + vector_loop(j, x->num_vect, z->vector_buffer[i*x->num_vect+j] = x->vector_buffer[i*x->num_vect+j] - alpha[k*x->num_vect+j]*y->vector_buffer[i*x->num_vect+j];) if( thread == 0 && start != end ) @@ -500,20 +513,22 @@ void vector_PRECISION_multi_saxpy_new( vector_PRECISION *z, vector_PRECISION *V, complex_PRECISION alpha_signed[count*z->num_vect]; for ( c=0; c<count; c++ ) /*for( j=0; j<z->num_vect; j+=num_loop) #pragma unroll #pragma vector aligned for( k=0; k<num_loop; k++) - alpha_signed[c*z->num_vect+j+k] = sign*alpha[c*z->num_vect+j+k]; + alpha_signed[c*z->num_vect+j+k] = sign*alpha[c*z->num_vect+j+k];*/ + vector_loop(j, z->num_vect, alpha_signed[c*z->num_vect+j] = sign*alpha[c*z->num_vect+j];) for ( c=0; c<count; c++ ) for( i=start; i<end; i++ ) /*for( j=0; j<z->num_vect; j+=num_loop) #pragma unroll #pragma vector aligned #pragma ivdep for( k=0; k<num_loop; k++) - z->vector_buffer[i*z->num_vect+j+k] += V[c].vector_buffer[i*z->num_vect+j+k]*alpha_signed[c]; + z->vector_buffer[i*z->num_vect+j+k] += V[c].vector_buffer[i*z->num_vect+j+k]*alpha_signed[c];*/ + vector_loop(j, z->num_vect, z->vector_buffer[i*z->num_vect+j] += V[c].vector_buffer[i*z->num_vect+j]*alpha_signed[c];) if( thread == 0 && start != end ) PROF_PRECISION_STOP( _LA8, (PRECISION)(count) ); diff --git a/src/main.h b/src/main.h index 1629c5e..b9cbd30 100644 --- a/src/main.h +++ b/src/main.h @@ -34,8 +34,8 @@ #define num_loop 4 - // #define vector_loop(k, instructions) _Pragma("unroll") _Pragma("vector aligned") _Pragma("ivdep") for(k=0; kvector_buffer[i*x->num_vect+j] = r_alpha[j]*x->vector_buffer[i*x->num_vect+j];) + vector_loop(j, x->num_vect, 
z->vector_buffer[i*x->num_vect+j] = r_alpha[j]*x->vector_buffer[i*x->num_vect+j];) } //vector_PRECISION_change_layout( x, x, _NV_LV_SV, no_threading ); //vector_PRECISION_change_layout( z, z, _NV_LV_SV, no_threading ); From dd9c21d56afe9225a680bddd347c3ddfd671e6b4 Mon Sep 17 00:00:00 2001 From: Marc Illa Date: Mon, 10 Dec 2018 11:41:09 +0200 Subject: [PATCH 30/31] Deleted files --- src/blas_vectorized.h | 70 - src/sse_blas_vectorized.h | 381 ---- src/sse_coarse_operator.h | 496 ----- src/sse_coarse_operator_generic.c | 962 --------- src/sse_coarse_operator_generic.h | 311 --- src/sse_complex_double_intrinsic.h | 90 - src/sse_complex_float_intrinsic.h | 165 -- src/sse_dirac.c | 2971 ---------------------------- src/sse_dirac.h | 606 ------ src/sse_dirac_su3local.h | 493 ----- src/sse_double_intrinsic.h | 61 - src/sse_float_intrinsic.h | 92 - src/sse_interpolation_generic.c | 672 ------- src/sse_interpolation_generic.h | 36 - src/sse_linalg.c | 795 -------- src/sse_linalg.h | 497 ----- src/sse_linalg_generic.c | 245 --- src/sse_linalg_generic.h | 54 - src/vectorization_control.h | 52 - src/vectorization_dirac_generic.c | 116 -- src/vectorization_dirac_generic.h | 136 -- 21 files changed, 9301 deletions(-) delete mode 100644 src/blas_vectorized.h delete mode 100644 src/sse_blas_vectorized.h delete mode 100644 src/sse_coarse_operator.h delete mode 100644 src/sse_coarse_operator_generic.c delete mode 100644 src/sse_coarse_operator_generic.h delete mode 100644 src/sse_complex_double_intrinsic.h delete mode 100644 src/sse_complex_float_intrinsic.h delete mode 100644 src/sse_dirac.c delete mode 100644 src/sse_dirac.h delete mode 100644 src/sse_dirac_su3local.h delete mode 100644 src/sse_double_intrinsic.h delete mode 100644 src/sse_float_intrinsic.h delete mode 100644 src/sse_interpolation_generic.c delete mode 100644 src/sse_interpolation_generic.h delete mode 100644 src/sse_linalg.c delete mode 100644 src/sse_linalg.h delete mode 100644 src/sse_linalg_generic.c delete mode 100644 src/sse_linalg_generic.h delete mode 100644 src/vectorization_control.h delete mode 100644 src/vectorization_dirac_generic.c delete mode 100644 src/vectorization_dirac_generic.h diff --git a/src/blas_vectorized.h b/src/blas_vectorized.h deleted file mode 100644 index 645c457..0000000 --- a/src/blas_vectorized.h +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder. - * - * This file is part of the DDalphaAMG solver library. - * - * The DDalphaAMG solver library is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * The DDalphaAMG solver library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * - * You should have received a copy of the GNU General Public License - * along with the DDalphaAMG solver library. If not, see http://www.gnu.org/licenses/. 
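/* The vector_loop macros introduced in main.h above are invoked throughout
 * linalg_generic.c as vector_loop(j, n, body) and vector_loop2(j, n, jj, body).
 * A minimal sketch of definitions consistent with those call sites -- the
 * exact pragma set and the num_loop blocking of vector_loop2 are assumptions,
 * not the committed code: */
#define vector_loop_sketch(j, n, instructions) \
  _Pragma("unroll") _Pragma("vector aligned") _Pragma("ivdep") \
  for( j=0; j<(n); j++ ){ instructions }
#define vector_loop2_sketch(j, n, jj, instructions) \
  for( j=0; j<(n); j+=num_loop ) \
    _Pragma("unroll") _Pragma("vector aligned") _Pragma("ivdep") \
    for( jj=0; jj<num_loop; jj++ ){ instructions }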
- * - */ - -#ifndef BLAS_VECTORIZED_H -#define BLAS_VECTORIZED_H - -// BLAS naming convention: LDA = leading dimension of A -#ifdef SSE -#include "sse_blas_vectorized.h" -#endif - -// C=A*B+C -static inline void cgemv(const int N, const OPERATOR_TYPE_float *A, int lda, const float *B, float *C) -{ -#ifdef SSE - sse_cgemv( N, A, lda, B, C ); -#endif -} - -// C=-A*B+C -static inline void cgenmv(const int N, const OPERATOR_TYPE_float *A, int lda, const float *B, float *C) -{ -#ifdef SSE - sse_cgenmv( N, A, lda, B, C ); -#endif -} - -// C=A*B+C with padded layout -static inline void cgemv_padded(const int N, const OPERATOR_TYPE_float *A, int lda, int padded, const float *B, float *C) -{ -#ifdef SSE - sse_cgemv_padded( N, A, lda, padded, B, C ); -#endif -} - -// C=-A*B+C with padded layout -static inline void cgenmv_padded(const int N, const OPERATOR_TYPE_float *A, int lda, int padded, const float *B, float *C) -{ -#ifdef SSE - sse_cgenmv_padded( N, A, lda, padded, B, C ); -#endif -} - - -static inline void cgem_inverse(const int N, OPERATOR_TYPE_float *A_inverse, OPERATOR_TYPE_float *A, int lda) -{ -#ifdef SSE - sse_cgem_inverse( N, A_inverse, A, lda ); -#endif -} - -#endif // BLAS_VECTORIZED_H diff --git a/src/sse_blas_vectorized.h b/src/sse_blas_vectorized.h deleted file mode 100644 index df99468..0000000 --- a/src/sse_blas_vectorized.h +++ /dev/null @@ -1,381 +0,0 @@ -/* - * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder. - * - * This file is part of the DDalphaAMG solver library. - * - * The DDalphaAMG solver library is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * The DDalphaAMG solver library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * - * You should have received a copy of the GNU General Public License - * along with the DDalphaAMG solver library. If not, see http://www.gnu.org/licenses/. 
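/* Reference semantics of the cgemv wrapper deleted above: C := A*B + C for an
 * N x N complex matrix stored column major with leading dimension lda. This
 * scalar version assumes a plain interleaved complex layout for illustration;
 * the SSE routines it used to dispatch to operate on a deinterleaved
 * real/imaginary layout instead. */
#include <complex.h>
static inline void cgemv_reference( const int N, const float _Complex *A, int lda,
                                    const float _Complex *B, float _Complex *C ) {
  for ( int j=0; j<N; j++ )      // loop over columns of A
    for ( int i=0; i<N; i++ )    // accumulate column j scaled by B[j] into C
      C[i] += A[i+j*lda]*B[j];
}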
- * - */ - -#ifndef SSE_BLAS_VECTORIZED_H -#define SSE_BLAS_VECTORIZED_H -#ifdef SSE - -static inline void sse_cgem_inverse( const int N, float *A_inverse, float *A, int lda ) { - // generate LU decomp in A - - int i, j, k; - complex_float alpha; - - complex_float tmpA[N*N]; - complex_float tmpA_inverse[N*N]; - - for ( j=0; j0 ) - b[k-1] = 0; - - for ( i=0; i=0; i-- ) { - for ( j=i+1; j= j*offset; i -= SIMD_LENGTH_float ) { - ip = i%offset + 2*(i/offset)*padded; - A_re = _mm_unpacklo_ps( C1_re[i/SIMD_LENGTH_float], C1_im[i/SIMD_LENGTH_float] ); - A_im = _mm_unpackhi_ps( C1_re[i/SIMD_LENGTH_float], C1_im[i/SIMD_LENGTH_float] ); - _mm_store_ps( C+2*ip, A_re ); - _mm_store_ps( C+2*ip+SIMD_LENGTH_float, A_im ); - A_re = _mm_unpacklo_ps( C2_re[i/SIMD_LENGTH_float], C2_im[i/SIMD_LENGTH_float] ); - A_im = _mm_unpackhi_ps( C2_re[i/SIMD_LENGTH_float], C2_im[i/SIMD_LENGTH_float] ); - _mm_store_ps( C+2*(ip+padded), A_re ); - _mm_store_ps( C+2*(ip+padded)+SIMD_LENGTH_float, A_im ); - } - } - } else { -#endif - __m128 A_re; - __m128 A_im; - __m128 B_re; - __m128 B_im; - __m128 C_re[lda/SIMD_LENGTH_float]; - __m128 C_im[lda/SIMD_LENGTH_float]; - - // deinterleaved load - for ( i=0; i= j*offset; i -= SIMD_LENGTH_float ) { - ip = i%offset + 2*(i/offset)*padded; - A_re = _mm_unpacklo_ps( C1_re[i/SIMD_LENGTH_float], C1_im[i/SIMD_LENGTH_float] ); - A_im = _mm_unpackhi_ps( C1_re[i/SIMD_LENGTH_float], C1_im[i/SIMD_LENGTH_float] ); - _mm_store_ps( C+2*ip, A_re ); - _mm_store_ps( C+2*ip+SIMD_LENGTH_float, A_im ); - A_re = _mm_unpacklo_ps( C2_re[i/SIMD_LENGTH_float], C2_im[i/SIMD_LENGTH_float] ); - A_im = _mm_unpackhi_ps( C2_re[i/SIMD_LENGTH_float], C2_im[i/SIMD_LENGTH_float] ); - _mm_store_ps( C+2*(ip+padded), A_re ); - _mm_store_ps( C+2*(ip+padded)+SIMD_LENGTH_float, A_im ); - } - } - } else { -#endif - __m128 A_re; - __m128 A_im; - __m128 B_re; - __m128 B_im; - __m128 C_re[lda/SIMD_LENGTH_float]; - __m128 C_im[lda/SIMD_LENGTH_float]; - - // deinterleaved load - for ( i=0; inext_level->num_lattice_site_var/2, - offset = l->num_lattice_site_var/2; - float *spin_0_1_pt; - float *spin_2_3_pt; - float *interpolation_data; - - int component_offset = SIMD_LENGTH_float; - int fine_components = l->num_lattice_site_var; - - // U(x) = [ A B , A=A*, D=D*, C = -B* - // C D ] - // storage order: upper triangle of A, upper triangle of D, B, columnwise - // diagonal coupling - for ( int n=0; nvector_size + fine_components*SIMD_LENGTH_float*site); - - // A - buffer_re = _mm_load_ps((float *)(tmp+k1)+k+0*OPERATOR_COMPONENT_OFFSET_float); - buffer_im = _mm_load_ps((float *)(tmp+k1)+k+1*OPERATOR_COMPONENT_OFFSET_float); - for ( m=0; m broadcast - __m128 spin_0_1_re = _mm_set1_ps(spin_0_1_pt[(2*m+0)*component_offset]); - __m128 spin_0_1_im = _mm_set1_ps(spin_0_1_pt[(2*m+1)*component_offset]); - __m128 interpolation_data_re = _mm_load_ps(interpolation_data + (2*m+0)*component_offset); - __m128 interpolation_data_im = _mm_load_ps(interpolation_data + (2*m+1)*component_offset); - - cfmadd_conj(interpolation_data_re, interpolation_data_im, spin_0_1_re, spin_0_1_im, &buffer_re, &buffer_im); - } - _mm_store_ps((float *)(tmp+k1)+k+0*OPERATOR_COMPONENT_OFFSET_float, buffer_re); - _mm_store_ps((float *)(tmp+k1)+k+1*OPERATOR_COMPONENT_OFFSET_float, buffer_im); - - // D - buffer_re = _mm_load_ps((float *)(tmp+k2)+k+0*OPERATOR_COMPONENT_OFFSET_float); - buffer_im = _mm_load_ps((float *)(tmp+k2)+k+1*OPERATOR_COMPONENT_OFFSET_float); - for ( m=offset; m<2*offset; m++ ) { - // spin_2_3 is the same for all k => broadcast - __m128 spin_2_3_re = 
_mm_set1_ps(spin_2_3_pt[(2*m+0)*component_offset]); - __m128 spin_2_3_im = _mm_set1_ps(spin_2_3_pt[(2*m+1)*component_offset]); - __m128 interpolation_data_re = _mm_load_ps(interpolation_data + (2*m+0)*component_offset); - __m128 interpolation_data_im = _mm_load_ps(interpolation_data + (2*m+1)*component_offset); - - cfmadd_conj(interpolation_data_re, interpolation_data_im, spin_2_3_re, spin_2_3_im, &buffer_re, &buffer_im); - } - _mm_store_ps((float *)(tmp+k2)+k+0*OPERATOR_COMPONENT_OFFSET_float, buffer_re); - _mm_store_ps((float *)(tmp+k2)+k+1*OPERATOR_COMPONENT_OFFSET_float, buffer_im); - } - - // index k used for vectorization - for ( k=0; kvector_size + fine_components*component_offset*site); - - // B - buffer_re = _mm_load_ps((float *)(tmp+k1)+k+0*OPERATOR_COMPONENT_OFFSET_float); - buffer_im = _mm_load_ps((float *)(tmp+k1)+k+1*OPERATOR_COMPONENT_OFFSET_float); - for ( m=0; m broadcast - __m128 spin_2_3_re = _mm_set1_ps(spin_2_3_pt[(2*m+0)*component_offset]); - __m128 spin_2_3_im = _mm_set1_ps(spin_2_3_pt[(2*m+1)*component_offset]); - __m128 interpolation_data_re = _mm_load_ps(interpolation_data + (2*m+0)*component_offset); - __m128 interpolation_data_im = _mm_load_ps(interpolation_data + (2*m+1)*component_offset); - - cfmadd_conj(interpolation_data_re, interpolation_data_im, spin_2_3_re, spin_2_3_im, &buffer_re, &buffer_im); - } - _mm_store_ps((float *)(tmp+k1)+k+0*OPERATOR_COMPONENT_OFFSET_float, buffer_re); - _mm_store_ps((float *)(tmp+k1)+k+1*OPERATOR_COMPONENT_OFFSET_float, buffer_im); - } - } -#endif -} - - -static inline void sse_set_coarse_neighbor_coupling_float( complex_float *spin_0_1, complex_float *spin_2_3, - complex_float *V, const int mu, level_struct *l, int site, const int n_rhs, complex_float *tmp ) { - -#ifdef SSE - int k, k1, k2, m, num_eig_vect = l->next_level->num_lattice_site_var/2, - offset = l->num_lattice_site_var/2; - - float *spin_0_1_pt; - float *spin_2_3_pt; - float *interpolation_data; - - int component_offset = SIMD_LENGTH_float; - int fine_components = l->num_lattice_site_var; - - // U_mu(x) = [ A B , U_-mu(x+muhat) = [ A* -C* - // C D ] -B* D* ] - // storage order: A, C, B, D, each column wise - for ( int n=0; nvector_size + fine_components*component_offset*site); - - k1 = (n+0*num_eig_vect)*OPERATOR_COMPONENT_OFFSET_float; - k2 = (n+1*num_eig_vect)*OPERATOR_COMPONENT_OFFSET_float; - - // A - buffer_re = _mm_load_ps((float *)(tmp+k1)+k+0*OPERATOR_COMPONENT_OFFSET_float); - buffer_im = _mm_load_ps((float *)(tmp+k1)+k+1*OPERATOR_COMPONENT_OFFSET_float); - for ( m=0; m broadcast - __m128 spin_0_1_re = _mm_set1_ps(spin_0_1_pt[(2*m+0)*component_offset]); - __m128 spin_0_1_im = _mm_set1_ps(spin_0_1_pt[(2*m+1)*component_offset]); - __m128 interpolation_data_re = _mm_load_ps(interpolation_data + (2*m+0)*component_offset); - __m128 interpolation_data_im = _mm_load_ps(interpolation_data + (2*m+1)*component_offset); - - cfmadd_conj(interpolation_data_re, interpolation_data_im, spin_0_1_re, spin_0_1_im, &buffer_re, &buffer_im); - } - _mm_store_ps((float *)(tmp+k1)+k+0*OPERATOR_COMPONENT_OFFSET_float, buffer_re); - _mm_store_ps((float *)(tmp+k1)+k+1*OPERATOR_COMPONENT_OFFSET_float, buffer_im); - - // C - buffer_re = _mm_load_ps((float *)(tmp+k2)+k+0*OPERATOR_COMPONENT_OFFSET_float); - buffer_im = _mm_load_ps((float *)(tmp+k2)+k+1*OPERATOR_COMPONENT_OFFSET_float); - for ( m=offset; m<2*offset; m++ ) { - // spin_0_1 is the same for all k => broadcast - __m128 spin_0_1_re = _mm_set1_ps(spin_0_1_pt[(2*m+0)*component_offset]); - __m128 spin_0_1_im = 
_mm_set1_ps(spin_0_1_pt[(2*m+1)*component_offset]); - __m128 interpolation_data_re = _mm_load_ps(interpolation_data + (2*m+0)*component_offset); - __m128 interpolation_data_im = _mm_load_ps(interpolation_data + (2*m+1)*component_offset); - - cfmadd_conj(interpolation_data_re, interpolation_data_im, spin_0_1_re, spin_0_1_im, &buffer_re, &buffer_im); - } - _mm_store_ps((float *)(tmp+k2)+k+0*OPERATOR_COMPONENT_OFFSET_float, buffer_re); - _mm_store_ps((float *)(tmp+k2)+k+1*OPERATOR_COMPONENT_OFFSET_float, buffer_im); - - - k1 = (n+2*num_eig_vect)*OPERATOR_COMPONENT_OFFSET_float; - k2 = (n+3*num_eig_vect)*OPERATOR_COMPONENT_OFFSET_float; - - // B - buffer_re = _mm_load_ps((float *)(tmp+k1)+k+0*OPERATOR_COMPONENT_OFFSET_float); - buffer_im = _mm_load_ps((float *)(tmp+k1)+k+1*OPERATOR_COMPONENT_OFFSET_float); - for ( m=0; m broadcast - __m128 spin_2_3_re = _mm_set1_ps(spin_2_3_pt[(2*m+0)*component_offset]); - __m128 spin_2_3_im = _mm_set1_ps(spin_2_3_pt[(2*m+1)*component_offset]); - __m128 interpolation_data_re = _mm_load_ps(interpolation_data + (2*m+0)*component_offset); - __m128 interpolation_data_im = _mm_load_ps(interpolation_data + (2*m+1)*component_offset); - - cfmadd_conj(interpolation_data_re, interpolation_data_im, spin_2_3_re, spin_2_3_im, &buffer_re, &buffer_im); - } - _mm_store_ps((float *)(tmp+k1)+k+0*OPERATOR_COMPONENT_OFFSET_float, buffer_re); - _mm_store_ps((float *)(tmp+k1)+k+1*OPERATOR_COMPONENT_OFFSET_float, buffer_im); - - // D - buffer_re = _mm_load_ps((float *)(tmp+k2)+k+0*OPERATOR_COMPONENT_OFFSET_float); - buffer_im = _mm_load_ps((float *)(tmp+k2)+k+1*OPERATOR_COMPONENT_OFFSET_float); - for ( m=offset; m<2*offset; m++ ) { - // spin_2_3 is the same for all k => broadcast - __m128 spin_2_3_re = _mm_set1_ps(spin_2_3_pt[(2*m+0)*component_offset]); - __m128 spin_2_3_im = _mm_set1_ps(spin_2_3_pt[(2*m+1)*component_offset]); - __m128 interpolation_data_re = _mm_load_ps(interpolation_data + (2*m+0)*component_offset); - __m128 interpolation_data_im = _mm_load_ps(interpolation_data + (2*m+1)*component_offset); - - cfmadd_conj(interpolation_data_re, interpolation_data_im, spin_2_3_re, spin_2_3_im, &buffer_re, &buffer_im); - } - _mm_store_ps((float *)(tmp+k2)+k+0*OPERATOR_COMPONENT_OFFSET_float, buffer_re); - _mm_store_ps((float *)(tmp+k2)+k+1*OPERATOR_COMPONENT_OFFSET_float, buffer_im); - } - } -#endif -} - - -static inline void sse_coarse_spinwise_site_self_couplings_float( complex_float *eta1, complex_float *eta2, - complex_float *phi, config_float clover, int elements, level_struct *l ) { - -#ifdef SSE - int num_eig_vect = l->num_lattice_site_var/2; - int clover_step_size1 = (num_eig_vect * (num_eig_vect+1))/2; - complex_float *eta[2] = {eta1, eta2}; - // U(x) = [ A B , A=A*, D=D*, C = -B* - // C D ] - // storage order: upper triangle of A, upper triangle of D, B, columnwise - - __m128 clover_re; - __m128 clover_im; - __m128 in_re; - __m128 in_im; - __m128 out_re; - __m128 out_im; - - // zero output matrices - __m128 zero = _mm_setzero_ps(); - for(int s=0; s<2; s++) { - for(int i=0; ieta1) or 2and3 (->eta2) - eta[1] += num_eig_vect*elements; - for(int s=0; s<2; s++) { - // A and D: column major hermitian, stored as upper triangular - for(int i=0; inum_parent_eig_vect; - int block_step_size = (num_eig_vect * (num_eig_vect+1))/2; - complex_float *eta[2] = {eta1, eta2}; - // U(x) = [ A 0 , A=A*, D=D* - // 0 D ] - // storage order: upper triangle of A, upper triangle of D, B, columnwise - - __m128 block_re; - __m128 block_im; - __m128 in_re; - __m128 in_im; - __m128 out_re; - __m128 
out_im; - - // zero output matrices - __m128 zero = _mm_setzero_ps(); - for(int s=0; s<2; s++) { - for(int i=0; ieta1) or 2and3 (->eta2) - eta[1] += num_eig_vect*elements; - for(int s=0; s<2; s++) { - // A and D: column major hermitian, stored as upper triangular - for(int i=0; inext_level->num_parent_eig_vect, - offset = l->num_parent_eig_vect; - float *spin_0_1_pt; - float *spin_2_3_pt; - float *interpolation_data; - - int component_offset = SIMD_LENGTH_float; - int fine_components = l->num_lattice_site_var; - - // U(x) = [ A 0 , A=A*, D=D* - // 0 D ] - // storage order: upper triangle of A, upper triangle of D, B, columnwise - // diagonal coupling - for ( int n=0; nvector_size + fine_components*SIMD_LENGTH_float*site); - - // A - buffer_re = _mm_load_ps((float *)(tmp+k1)+k+0*OPERATOR_COMPONENT_OFFSET_float); - buffer_im = _mm_load_ps((float *)(tmp+k1)+k+1*OPERATOR_COMPONENT_OFFSET_float); - for ( m=0; m broadcast - __m128 spin_0_1_re = _mm_set1_ps(spin_0_1_pt[(2*m+0)*component_offset]); - __m128 spin_0_1_im = _mm_set1_ps(spin_0_1_pt[(2*m+1)*component_offset]); - __m128 interpolation_data_re = _mm_load_ps(interpolation_data + (2*m+0)*component_offset); - __m128 interpolation_data_im = _mm_load_ps(interpolation_data + (2*m+1)*component_offset); - - cfmadd_conj(interpolation_data_re, interpolation_data_im, spin_0_1_re, spin_0_1_im, &buffer_re, &buffer_im); - } - _mm_store_ps((float *)(tmp+k1)+k+0*OPERATOR_COMPONENT_OFFSET_float, buffer_re); - _mm_store_ps((float *)(tmp+k1)+k+1*OPERATOR_COMPONENT_OFFSET_float, buffer_im); - - // D - buffer_re = _mm_load_ps((float *)(tmp+k2)+k+0*OPERATOR_COMPONENT_OFFSET_float); - buffer_im = _mm_load_ps((float *)(tmp+k2)+k+1*OPERATOR_COMPONENT_OFFSET_float); - for ( m=offset; m<2*offset; m++ ) { - // spin_2_3 is the same for all k => broadcast - __m128 spin_2_3_re = _mm_set1_ps(spin_2_3_pt[(2*m+0)*component_offset]); - __m128 spin_2_3_im = _mm_set1_ps(spin_2_3_pt[(2*m+1)*component_offset]); - __m128 interpolation_data_re = _mm_load_ps(interpolation_data + (2*m+0)*component_offset); - __m128 interpolation_data_im = _mm_load_ps(interpolation_data + (2*m+1)*component_offset); - - cfmadd_conj(interpolation_data_re, interpolation_data_im, spin_2_3_re, spin_2_3_im, &buffer_re, &buffer_im); - } - _mm_store_ps((float *)(tmp+k2)+k+0*OPERATOR_COMPONENT_OFFSET_float, buffer_re); - _mm_store_ps((float *)(tmp+k2)+k+1*OPERATOR_COMPONENT_OFFSET_float, buffer_im); - } - } -#endif -} - -#endif //SSE -#endif diff --git a/src/sse_coarse_operator_generic.c b/src/sse_coarse_operator_generic.c deleted file mode 100644 index cde7a51..0000000 --- a/src/sse_coarse_operator_generic.c +++ /dev/null @@ -1,962 +0,0 @@ -/* - * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder. - * - * This file is part of the DDalphaAMG solver library. - * - * The DDalphaAMG solver library is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * The DDalphaAMG solver library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * - * You should have received a copy of the GNU General Public License - * along with the DDalphaAMG solver library. If not, see http://www.gnu.org/licenses/. 
- * - */ - -#include "main.h" - -#ifdef SSE - -#include "sse_coarse_operator.h" - -#ifdef INTERPOLATION_SETUP_LAYOUT_OPTIMIZED_PRECISION -void coarse_operator_PRECISION_setup_vectorized( complex_PRECISION *operator, level_struct *l, struct Thread *threading ) { - - SYNC_HYPERTHREADS(threading) - SYNC_CORES(threading) - - double t0, t1; - t0 = MPI_Wtime(); - - int mu, j, n = l->num_eig_vect, num_aggregates = l->is_PRECISION.num_agg, - aggregate_sites = l->num_inner_lattice_sites / num_aggregates, - clover_site_size = (l->num_eig_vect*(l->num_eig_vect*2+1)), - block_site_size = (l->num_eig_vect*(l->num_eig_vect+1)), - D_link_size = 4*l->num_eig_vect*l->num_eig_vect*4, // size of links in all 4 directions - fine_components = l->num_lattice_site_var; - - - - START_LOCKED_MASTER(threading) - operator_PRECISION_define( &(l->next_level->op_PRECISION), l->next_level ); - END_LOCKED_MASTER(threading) - SYNC_HYPERTHREADS(threading) - - // each thread loops overs its aggregates and then over internal d.o.f. - for ( int a=threading->n_core*threading->thread+threading->core; an_core*threading->n_thread ) { - for ( j=0; jnext_level->op_PRECISION.D[j+a*D_link_size] = _COMPLEX_PRECISION_ZERO; - for ( j=0; jnext_level->op_PRECISION.clover[j+a*clover_site_size] = _COMPLEX_PRECISION_ZERO; - for ( j=0; jnext_level->op_PRECISION.odd_proj[j+a*block_site_size] = _COMPLEX_PRECISION_ZERO; - } - - complex_PRECISION *mpi_buffer = NULL; - START_MASTER(threading) - MALLOC_HUGEPAGES( mpi_buffer, complex_PRECISION, OPERATOR_COMPONENT_OFFSET_PRECISION*(l->vector_size-l->inner_vector_size), 64 ); - END_MASTER(threading) - - int direction_flags[8*l->block_lattice[T]*l->block_lattice[Z]*l->block_lattice[Y]*l->block_lattice[X]]; - - // set up table for direction flags - int *flags = direction_flags; - if(l->depth == 0) { - // even sites - for(int t=0; t < l->block_lattice[T]; t++) { - for(int z=0; z < l->block_lattice[Z]; z++) { - for(int y=0; y < l->block_lattice[Y]; y++) { - for(int x=0; x < l->block_lattice[X]; x++) { - if((x+y+z+t)%2 == 0) { - flags[2*X+0] = (x == 0)?0:1; - flags[2*X+1] = (x == l->block_lattice[X]-1)?0:1; - flags[2*Y+0] = (y == 0)?0:1; - flags[2*Y+1] = (y == l->block_lattice[Y]-1)?0:1; - flags[2*Z+0] = (z == 0)?0:1; - flags[2*Z+1] = (z == l->block_lattice[Z]-1)?0:1; - flags[2*T+0] = (t == 0)?0:1; - flags[2*T+1] = (t == l->block_lattice[T]-1)?0:1; - flags += 8; - } - } - } - } - } - // odd sites - for(int t=0; t < l->block_lattice[T]; t++) { - for(int z=0; z < l->block_lattice[Z]; z++) { - for(int y=0; y < l->block_lattice[Y]; y++) { - for(int x=0; x < l->block_lattice[X]; x++) { - if((x+y+z+t)%2 == 1) { - flags[2*X+0] = (x == 0)?0:1; - flags[2*X+1] = (x == l->block_lattice[X]-1)?0:1; - flags[2*Y+0] = (y == 0)?0:1; - flags[2*Y+1] = (y == l->block_lattice[Y]-1)?0:1; - flags[2*Z+0] = (z == 0)?0:1; - flags[2*Z+1] = (z == l->block_lattice[Z]-1)?0:1; - flags[2*T+0] = (t == 0)?0:1; - flags[2*T+1] = (t == l->block_lattice[T]-1)?0:1; - flags += 8; - } - } - } - } - } - } else { - for(int t=0; t < l->block_lattice[T]; t++) { - for(int z=0; z < l->block_lattice[Z]; z++) { - for(int y=0; y < l->block_lattice[Y]; y++) { - for(int x=0; x < l->block_lattice[X]; x++) { - flags[2*X+0] = (x == 0)?0:1; - flags[2*X+1] = (x == l->block_lattice[X]-1)?0:1; - flags[2*Y+0] = (y == 0)?0:1; - flags[2*Y+1] = (y == l->block_lattice[Y]-1)?0:1; - flags[2*Z+0] = (z == 0)?0:1; - flags[2*Z+1] = (z == l->block_lattice[Z]-1)?0:1; - flags[2*T+0] = (t == 0)?0:1; - flags[2*T+1] = (t == l->block_lattice[T]-1)?0:1; - flags += 8; - } - } - } - } 
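/* How the direction_flags table built above is consumed in the aggregate
 * loops that follow: each site owns eight flags recording whether its
 * backward (entry 2*mu+0) and forward (entry 2*mu+1) neighbor in direction mu
 * lies inside the same block, so couplings that would cross the aggregate
 * boundary can be skipped. Illustrative helper, an assumption rather than
 * part of the original file: */
static inline int neighbor_in_block( const int *direction_flags, int block_volume,
                                     int site, int mu, int forward ) {
  const int *flags = direction_flags + 8*(site % block_volume);
  return flags[2*mu + (forward ? 1 : 0)];   // 1: interior neighbor, 0: block boundary
}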
- } - - complex_PRECISION eta1[fine_components*OPERATOR_COMPONENT_OFFSET_PRECISION] __attribute__((aligned(64))); - complex_PRECISION eta2[fine_components*OPERATOR_COMPONENT_OFFSET_PRECISION] __attribute__((aligned(64))); - complex_PRECISION tmp[4*4*n*OPERATOR_COMPONENT_OFFSET_PRECISION] __attribute__((aligned(64))); - - for ( int a=threading->n_core*threading->thread+threading->core; an_core*threading->n_thread ) { - - // new aggregate is starting, zero out tmp - for(int i=0; i<4*4*n*OPERATOR_COMPONENT_OFFSET_PRECISION; i++) - tmp[i] = 0.0; - - for ( int site=a*aggregate_sites; site<(a+1)*aggregate_sites; site++ ) { - if(l->depth == 0) { - for ( int c=0; cnum_eig_vect; c+=SIMD_LENGTH_PRECISION ) - d_plus_clover_aggregate_PRECISION_vectorized( eta1+c*fine_components, eta2+c*fine_components, - operator+c*l->vector_size, &(l->s_PRECISION), l, site, - direction_flags+8*(site%(l->block_lattice[T]*l->block_lattice[Z]*l->block_lattice[Y]*l->block_lattice[X])) ); - } else { - for ( int c=0; cnum_eig_vect; c+=SIMD_LENGTH_PRECISION ) - coarse_aggregate_self_couplings_PRECISION_vectorized( eta1+c*fine_components, eta2+c*fine_components, - operator+c*l->vector_size, &(l->s_PRECISION), l, site, - direction_flags+8*(site%(l->block_lattice[T]*l->block_lattice[Z]*l->block_lattice[Y]*l->block_lattice[X])) ); - } - set_coarse_self_coupling_PRECISION_vectorized( eta1, eta2, operator, l, site, n, tmp ); - } - - // aggregate is done, finalize - set_coarse_self_coupling_PRECISION_vectorized_finalize( l, a*aggregate_sites, n, tmp ); - - } - - - SYNC_HYPERTHREADS(threading) - START_LOCKED_MASTER(threading) - // neighbors - for ( int c=0; cnum_eig_vect; c+=SIMD_LENGTH_PRECISION ) { - for ( mu=0; mu<4; mu++ ) { - // determine start of buffer for this mu - int start = 0; - for ( int j=0; js_PRECISION.op.c.num_boundary_sites[2*j]; - - // update ghost cells of V[i] - negative_sendrecv_PRECISION_vectorized( operator+c*l->vector_size, mu, &(l->s_PRECISION.op.c), l, - SIMD_LENGTH_PRECISION, mpi_buffer+c*(l->vector_size-l->inner_vector_size)+fine_components*start*SIMD_LENGTH_PRECISION ); - } - for ( mu=0; mu<4; mu++ ) { - // finish updating ghostcells of V[i] - negative_wait_PRECISION( mu, &(l->s_PRECISION.op.c), l ); - } - } - END_LOCKED_MASTER(threading) - SYNC_HYPERTHREADS(threading) - - - for ( int a=threading->n_core*threading->thread+threading->core; an_core*threading->n_thread ) { - - // new aggregate is starting, zero out tmp - for(int i=0; i<4*4*n*OPERATOR_COMPONENT_OFFSET_PRECISION; i++) - tmp[i] = 0.0; - - for ( int site=a*aggregate_sites; site<(a+1)*aggregate_sites; site++ ) { - for ( mu=0; mu<4; mu++ ) { - if( (direction_flags+8*(site%(l->block_lattice[T]*l->block_lattice[Z]*l->block_lattice[Y]*l->block_lattice[X])))[2*mu+1] != 0) - continue; - - if(l->depth == 0) - for ( int c=0; cnum_eig_vect; c+=SIMD_LENGTH_PRECISION ) - d_neighbor_aggregate_PRECISION_vectorized( eta1+c*fine_components, eta2+c*fine_components, - operator+c*l->vector_size, mu, &(l->s_PRECISION), l, site ); - else - for ( int c=0; cnum_eig_vect; c+=SIMD_LENGTH_PRECISION ) - coarse_aggregate_neighbor_couplings_PRECISION_vectorized( eta1+c*fine_components, eta2+c*fine_components, - operator+c*l->vector_size, mu, &(l->s_PRECISION), l, site ); - set_coarse_neighbor_coupling_PRECISION_vectorized( eta1, eta2, operator, mu, l, site, n, tmp+mu*4*n*OPERATOR_COMPONENT_OFFSET_PRECISION ); - } - } - - // aggregate is done, finalize - for ( mu=0; mu<4; mu++ ) - set_coarse_neighbor_coupling_PRECISION_vectorized_finalize( mu, l, a*aggregate_sites, n, 
tmp+mu*4*n*OPERATOR_COMPONENT_OFFSET_PRECISION ); - } - - SYNC_HYPERTHREADS(threading) - SYNC_CORES(threading) - - for ( int a=threading->n_core*threading->thread+threading->core; an_core*threading->n_thread ) { - - // new aggregate is starting, zero out tmp - for(int i=0; i<4*4*n*OPERATOR_COMPONENT_OFFSET_PRECISION; i++) - tmp[i] = 0.0; - - for ( int site=a*aggregate_sites; site<(a+1)*aggregate_sites; site++ ) { - if(l->depth == 0) { - for ( int c=0; cnum_eig_vect; c+=SIMD_LENGTH_PRECISION ) - diagonal_aggregate_PRECISION_vectorized( eta1+c*fine_components, eta2+c*fine_components, - operator+c*l->vector_size, &(l->s_PRECISION), l, site ); - } else { - for ( int c=0; cnum_eig_vect; c+=SIMD_LENGTH_PRECISION ) - coarse_aggregate_block_diagonal_PRECISION_vectorized( eta1+c*fine_components, eta2+c*fine_components, - operator+c*l->vector_size, &(l->s_PRECISION), l, site ); - } - set_coarse_block_diagonal_PRECISION_vectorized( eta1, eta2, operator, l, site, n, tmp ); - } - - // aggregate is done, finalize - set_coarse_block_diagonal_PRECISION_vectorized_finalize( l, a*aggregate_sites, n, tmp ); - } - - SYNC_HYPERTHREADS(threading) - SYNC_CORES(threading) - - coarse_operator_PRECISION_setup_finalize( l, threading ); - - START_MASTER(threading) - FREE_HUGEPAGES( mpi_buffer, complex_PRECISION, OPERATOR_COMPONENT_OFFSET_PRECISION*(l->vector_size-l->inner_vector_size) ); - - t1 = MPI_Wtime(); - if ( g.print > 0 ) printf0("depth: %d, time spent for setting up next coarser operator: %lf seconds\n", l->depth, t1-t0 ); - END_MASTER(threading) - - SYNC_HYPERTHREADS(threading) - SYNC_CORES(threading) -} -#endif - -void set_coarse_self_coupling_PRECISION_vectorized( complex_PRECISION *spin_0_1, complex_PRECISION *spin_2_3, - complex_PRECISION *V, level_struct *l, int site, const int n_rhs, complex_PRECISION *tmp ) { - - sse_set_coarse_self_coupling_PRECISION( spin_0_1, spin_2_3, V, l, site, n_rhs, tmp ); -} - -void set_coarse_block_diagonal_PRECISION_vectorized( complex_PRECISION *spin_0_1, complex_PRECISION *spin_2_3, - complex_PRECISION *V, level_struct *l, int site, const int n_rhs, complex_PRECISION *tmp ) { - - sse_set_coarse_block_diagonal_PRECISION( spin_0_1, spin_2_3, V, l, site, n_rhs, tmp ); -} - -void set_coarse_self_coupling_PRECISION_vectorized_finalize( level_struct *l, int site, const int n_rhs, complex_PRECISION *tmp ) { - - int k, k1, k2, num_aggregates = l->is_PRECISION.num_agg, - num_eig_vect = l->next_level->num_lattice_site_var/2, - aggregate_size = l->inner_vector_size / num_aggregates, - clover_site_size = (l->next_level->num_lattice_site_var*(l->next_level->num_lattice_site_var+1))/2; - int t1, t2; - - config_PRECISION clover_pt, clover = l->next_level->op_PRECISION.clover; - - // just an abbreviation - int component_offset = OPERATOR_COMPONENT_OFFSET_PRECISION; - int fine_components = l->num_lattice_site_var; - - int aggregate = (fine_components*site)/aggregate_size; - clover_pt = clover + aggregate*clover_site_size; - - // U(x) = [ A B , A=A*, D=D*, C = -B* - // C D ] - // storage order: upper triangle of A, upper triangle of D, B, columnwise - // diagonal coupling - for ( int n=0; nnext_level->num_lattice_site_var/2, - D_link_size = num_eig_vect*num_eig_vect*4; - int t1, t2; - - config_PRECISION D_pt, D = l->next_level->op_PRECISION.D; - - // just an abbreviation - int component_offset = OPERATOR_COMPONENT_OFFSET_PRECISION; - int fine_components = l->num_lattice_site_var; - - int aggregate = (fine_components*site)/(l->inner_vector_size / l->is_PRECISION.num_agg); - D_pt = D + 
(4*aggregate+mu)*D_link_size; - - // U_mu(x) = [ A B , U_-mu(x+muhat) = [ A* -C* - // C D ] -B* D* ] - // storage order: A, C, B, D, each column wise - for ( int n=0; nis_PRECISION.num_agg, - num_eig_vect = l->next_level->num_parent_eig_vect, - aggregate_size = l->inner_vector_size / num_aggregates, - block_site_size = (l->next_level->num_parent_eig_vect*(l->next_level->num_parent_eig_vect+1)); - int t1, t2; - - config_PRECISION block_pt, block = l->next_level->op_PRECISION.odd_proj; - - // just an abbreviation - int component_offset = OPERATOR_COMPONENT_OFFSET_PRECISION; - int fine_components = l->num_lattice_site_var; - - int aggregate = (fine_components*site)/aggregate_size; - block_pt = block + aggregate*block_site_size; - - // U(x) = [ A 0 , A=A*, D=D* - // 0 D ] - // storage order: upper triangle of A, upper triangle of D, B, columnwise - // diagonal coupling - for ( int n=0; n i) { - ip = j; - jp = i; - sign = -1.0; - } - int offset_to_column = (ip*ip+ip)/2; // upper triangle including diagonal - out_tmp[(2*i+0)*column_offset + j] = creal(clover[offset_to_column+jp]); - out_tmp[(2*i+1)*column_offset + j] = sign*cimag(clover[offset_to_column+jp]); - // C = -B^dagger - out_tmp[(2*i+0)*column_offset + j + vecs] = -creal(clover[offset_to_B + j*vecs+i]); - out_tmp[(2*i+1)*column_offset + j + vecs] = cimag(clover[offset_to_B + j*vecs+i]); - } - // zero - for(int j=2*vecs; j i) { - ip = j; - jp = i; - sign = -1.0; - } - int offset_to_column = (ip*ip+ip)/2; // upper triangle including diagonal - out_tmp[(2*(i+vecs)+0)*column_offset + j + vecs] = creal(clover[offset_to_D + offset_to_column+jp]); - out_tmp[(2*(i+vecs)+1)*column_offset + j + vecs] = sign*cimag(clover[offset_to_D + offset_to_column+jp]); - } - // zero - for(int j=2*vecs; j i) { - ip = j; - jp = i; - sign = -1.0; - } - int offset_to_column = (ip*ip+ip)/2; // upper triangle including diagonal - // A - out_tmp[(2*(i+0*vecs)+0)*column_offset + j + 0*vecs] = - out_tmp[(2*(i+1*vecs)+0)*column_offset + j + 1*vecs] = creal(clover[offset_to_column+jp]); - out_tmp[(2*(i+0*vecs)+1)*column_offset + j + 0*vecs] = - out_tmp[(2*(i+1*vecs)+1)*column_offset + j + 1*vecs] = sign*cimag(clover[offset_to_column+jp]); - // B - out_tmp[(2*(i+2*vecs)+0)*column_offset + j + 0*vecs] = - out_tmp[(2*(i+3*vecs)+0)*column_offset + j + 1*vecs] = creal(clover[offset_to_B + i*vecs+j]); - out_tmp[(2*(i+2*vecs)+1)*column_offset + j + 0*vecs] = - out_tmp[(2*(i+3*vecs)+1)*column_offset + j + 1*vecs] = cimag(clover[offset_to_B + i*vecs+j]); - // C = -B^dagger - out_tmp[(2*(i+0*vecs)+0)*column_offset + j + 2*vecs] = - out_tmp[(2*(i+1*vecs)+0)*column_offset + j + 3*vecs] = -creal(clover[offset_to_B + j*vecs+i]); - out_tmp[(2*(i+0*vecs)+1)*column_offset + j + 2*vecs] = - out_tmp[(2*(i+1*vecs)+1)*column_offset + j + 3*vecs] = cimag(clover[offset_to_B + j*vecs+i]); - // D - out_tmp[(2*(i+2*vecs)+0)*column_offset + j + 2*vecs] = - out_tmp[(2*(i+3*vecs)+0)*column_offset + j + 3*vecs] = creal(clover[offset_to_D + offset_to_column+jp]); - out_tmp[(2*(i+2*vecs)+1)*column_offset + j + 2*vecs] = - out_tmp[(2*(i+3*vecs)+1)*column_offset + j + 3*vecs] = sign*cimag(clover[offset_to_D + offset_to_column+jp]); - // 0 - out_tmp[(2*(i+0*vecs)+0)*column_offset + j + 1*vecs] = - out_tmp[(2*(i+0*vecs)+1)*column_offset + j + 1*vecs] = - out_tmp[(2*(i+0*vecs)+0)*column_offset + j + 3*vecs] = - out_tmp[(2*(i+0*vecs)+1)*column_offset + j + 3*vecs] = - out_tmp[(2*(i+1*vecs)+0)*column_offset + j + 0*vecs] = - out_tmp[(2*(i+1*vecs)+1)*column_offset + j + 0*vecs] = - 
out_tmp[(2*(i+1*vecs)+0)*column_offset + j + 2*vecs] = - out_tmp[(2*(i+1*vecs)+1)*column_offset + j + 2*vecs] = - out_tmp[(2*(i+2*vecs)+0)*column_offset + j + 1*vecs] = - out_tmp[(2*(i+2*vecs)+1)*column_offset + j + 1*vecs] = - out_tmp[(2*(i+2*vecs)+0)*column_offset + j + 3*vecs] = - out_tmp[(2*(i+2*vecs)+1)*column_offset + j + 3*vecs] = - out_tmp[(2*(i+3*vecs)+0)*column_offset + j + 0*vecs] = - out_tmp[(2*(i+3*vecs)+1)*column_offset + j + 0*vecs] = - out_tmp[(2*(i+3*vecs)+0)*column_offset + j + 2*vecs] = - out_tmp[(2*(i+3*vecs)+1)*column_offset + j + 2*vecs] = 0.0; - } - // zero - for(int j=4*vecs; j i) { - ip = j; - jp = i; - sign = -1.0; - } - int offset_to_column = (ip*ip+ip)/2; // upper triangle including diagonal - // E - out_tmp[(2*i+0)*column_offset + j] += sign*creal(tm_term[offset_to_column+jp]); - out_tmp[(2*i+1)*column_offset + j] += cimag(tm_term[offset_to_column+jp]); - // F - out_tmp[(2*(i+vecs)+0)*column_offset + j + vecs] += sign*creal(tm_term[offset_to_F + offset_to_column+jp]); - out_tmp[(2*(i+vecs)+1)*column_offset + j + vecs] += cimag(tm_term[offset_to_F + offset_to_column+jp]); - } - } - tm_term += 2*offset_to_F; - // out_tmp is an alias for the actual output - out_tmp += 2*column_offset*2*vecs; - } -#endif -} - -void add_tm_term_to_doublet_vectorized_layout_PRECISION(config_PRECISION tm_term, OPERATOR_TYPE_PRECISION *clover_vectorized, - int num_aggregates, int num_eig_vect) { -#ifdef HAVE_TM - int vecs = num_eig_vect; - // in vectorized layout clover is stored column wise, but not split into ABCD - // each column is padded, such that next column can also start at 64B boundary - int column_offset = SIMD_LENGTH_PRECISION*((4*vecs+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - // offset between blocks in clover - int offset_to_F = (vecs*vecs+vecs)/2; // upper triangle of A including diagonal - - PRECISION *out_tmp = clover_vectorized; - - // we add/sub the tm term to cloverD_vectorized - // A0B0 E000 0000 - // 0A0B + 0000 - 0E00 - // C0D0 00F0 0000 - // 0C0D 0000 000F - // 0000 0000 0000 - // (column wise, size of zeros such that columns length is multiple of 64B) - - // 4 directions - for ( int a=0; a i) { - ip = j; - jp = i; - sign = -1.0; - } - int offset_to_column = (ip*ip+ip)/2; // upper triangle including diagonal - // E - out_tmp[(2*(i+0*vecs)+0)*column_offset + j + 0*vecs] += sign*creal(tm_term[offset_to_column+jp]); - out_tmp[(2*(i+0*vecs)+1)*column_offset + j + 0*vecs] += cimag(tm_term[offset_to_column+jp]); - out_tmp[(2*(i+1*vecs)+0)*column_offset + j + 1*vecs] -= sign*creal(tm_term[offset_to_column+jp]); - out_tmp[(2*(i+1*vecs)+1)*column_offset + j + 1*vecs] -= cimag(tm_term[offset_to_column+jp]); - // F - out_tmp[(2*(i+2*vecs)+0)*column_offset + j + 2*vecs] += sign*creal(tm_term[offset_to_F+offset_to_column+jp]); - out_tmp[(2*(i+2*vecs)+1)*column_offset + j + 2*vecs] += cimag(tm_term[offset_to_F+offset_to_column+jp]); - out_tmp[(2*(i+3*vecs)+0)*column_offset + j + 3*vecs] -= sign*creal(tm_term[offset_to_F+offset_to_column+jp]); - out_tmp[(2*(i+3*vecs)+1)*column_offset + j + 3*vecs] -= cimag(tm_term[offset_to_F+offset_to_column+jp]); - } - } - tm_term += 2*offset_to_F; - // out_tmp is an alias for the actual output - out_tmp += 2*4*vecs*column_offset; - } -#endif -} - -void add_epsbar_term_to_doublet_vectorized_layout_PRECISION(config_PRECISION eps_term, OPERATOR_TYPE_PRECISION *clover_vectorized, - int num_aggregates, int num_eig_vect) { -#ifdef HAVE_TM1p1 - int vecs = num_eig_vect; - // in vectorized layout clover is stored column wise, but not split 
-void add_epsbar_term_to_doublet_vectorized_layout_PRECISION(config_PRECISION eps_term, OPERATOR_TYPE_PRECISION *clover_vectorized,
-                                                            int num_aggregates, int num_eig_vect) {
-#ifdef HAVE_TM1p1
-  int vecs = num_eig_vect;
-  // in vectorized layout clover is stored column wise, but not split into ABCD
-  // each column is padded, such that next column can also start at 64B boundary
-  int column_offset = SIMD_LENGTH_PRECISION*((4*vecs+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION);
-  // offset between blocks in clover
-  int offset_to_F = (vecs*vecs+vecs)/2; // upper triangle of A including diagonal
-
-  PRECISION *out_tmp = clover_vectorized;
-
-  // we add the eps term to cloverD_vectorized
-  // A0B0   0E00
-  // 0A0B + E000
-  // C0D0   000F
-  // 0C0D   00F0
-  // 0000   0000
-  // (column wise, size of zeros such that columns length is multiple of 64B)
-
-  // 4 directions
-  for ( int a=0; a<num_aggregates; a++ ) {
-    for ( int i=0; i<vecs; i++ ) {
-      for ( int j=0; j<vecs; j++ ) {
-        int ip = i, jp = j;
-        PRECISION sign = 1.0;
-        if ( j > i) {
-          ip = j;
-          jp = i;
-          sign = -1.0;
-        }
-        int offset_to_column = (ip*ip+ip)/2; // upper triangle including diagonal
-        // E
-        out_tmp[(2*(i+0*vecs)+0)*column_offset + j + 1*vecs] += sign*creal(eps_term[offset_to_column+jp]);
-        out_tmp[(2*(i+0*vecs)+1)*column_offset + j + 1*vecs] += cimag(eps_term[offset_to_column+jp]);
-        out_tmp[(2*(i+1*vecs)+0)*column_offset + j + 0*vecs] += sign*creal(eps_term[offset_to_column+jp]);
-        out_tmp[(2*(i+1*vecs)+1)*column_offset + j + 0*vecs] += cimag(eps_term[offset_to_column+jp]);
-        // F
-        out_tmp[(2*(i+2*vecs)+0)*column_offset + j + 3*vecs] += sign*creal(eps_term[offset_to_F+offset_to_column+jp]);
-        out_tmp[(2*(i+2*vecs)+1)*column_offset + j + 3*vecs] += cimag(eps_term[offset_to_F+offset_to_column+jp]);
-        out_tmp[(2*(i+3*vecs)+0)*column_offset + j + 2*vecs] += sign*creal(eps_term[offset_to_F+offset_to_column+jp]);
-        out_tmp[(2*(i+3*vecs)+1)*column_offset + j + 2*vecs] += cimag(eps_term[offset_to_F+offset_to_column+jp]);
-      }
-    }
-    eps_term += 2*offset_to_F;
-    // out_tmp is an alias for the actual output
-    out_tmp += 2*4*vecs*column_offset;
-  }
-#endif
-}
-
-void coarse_aggregate_self_couplings_PRECISION_vectorized( complex_PRECISION *eta1, complex_PRECISION *eta2,
-                                                           complex_PRECISION *phi, schwarz_PRECISION_struct *s,
-                                                           level_struct *l, int site, int *direction_flags ) {
-
-  int offset = SIMD_LENGTH_PRECISION;
-  int site_offset = l->num_lattice_site_var*offset;
-  int index_bw;
-  int index_fw;
-  int *neighbor = s->op.neighbor_table;
-  int *backward_neighbor = s->op.backward_neighbor_table;
-  complex_PRECISION *phi_pt;
-  config_PRECISION D_pt;
-  config_PRECISION D = s->op.D;
-  int n = l->num_lattice_site_var;
-  int D_site_offset = 4*n*n;
-  int D_link_offset = n*n;
-  int clover_offset = (n*(n+1))/2*site;
-
-  coarse_spinwise_site_self_couplings_PRECISION_vectorized( eta1, eta2, phi+site_offset*site, s->op.clover+clover_offset, offset, l );
-
-  for(int mu=0; mu<4; mu++) {
-    index_fw = neighbor[5*site+1 + mu];
-    index_bw = backward_neighbor[5*site+1 + mu];
-
-    // from backward
-    if ( direction_flags[2*mu+0] == 1 ) {
-      D_pt = D + D_site_offset*index_bw + D_link_offset*mu;
-      phi_pt = phi + site_offset*index_bw;
-      coarse_spinwise_n_daggered_hopp_PRECISION_vectorized( eta1, eta2, phi_pt, D_pt, offset, l );
-    }
-
-    // from forward
-    if ( direction_flags[2*mu+1] == 1 ) {
-      D_pt = D + D_site_offset*site + D_link_offset*mu;
-      phi_pt = phi + site_offset*index_fw;
-      coarse_spinwise_n_hopp_PRECISION_vectorized( eta1, eta2, phi_pt, D_pt, offset, l );
-    }
-  }
-}
-
-void coarse_aggregate_block_diagonal_PRECISION_vectorized( complex_PRECISION *eta1, complex_PRECISION *eta2,
-                                                           complex_PRECISION *phi, schwarz_PRECISION_struct *s,
-                                                           level_struct *l, int site ) {
-
-  int offset = SIMD_LENGTH_PRECISION;
-  int site_offset = l->num_lattice_site_var*offset;
-  int n = l->num_parent_eig_vect;
-  int block_offset = (n*(n+1))*site;
-
-  sse_coarse_aggregate_block_diagonal_PRECISION( eta1, eta2, phi+site_offset*site, s->op.odd_proj+block_offset, offset, l );
-}
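
// Sketch of the operator indexing used by the aggregate routines above
// (illustrative only; these helper names are not part of the library): the
// coupling matrices D are stored as four n x n complex blocks per site,
// one per direction mu, and the neighbor table keeps five entries per
// site, with the forward neighbor in direction mu at 5*site+1+mu.
#include <complex.h>

static const double complex *link_matrix( const double complex *D,
                                          int site, int mu, int n ) {
  int D_site_offset = 4*n*n;            // four direction blocks per site
  int D_link_offset = n*n;              // one n x n block per direction
  return D + D_site_offset*site + D_link_offset*mu;
}

static int forward_neighbor( const int *neighbor, int site, int mu ) {
  return neighbor[5*site+1 + mu];       // entry 5*site is the site itself
}
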
-
-void coarse_aggregate_neighbor_couplings_PRECISION_vectorized( complex_PRECISION *eta1, complex_PRECISION *eta2,
-                                                               complex_PRECISION *phi, const int mu,
-                                                               schwarz_PRECISION_struct *s, level_struct *l, int site ) {
-
-  int offset = SIMD_LENGTH_PRECISION;
-  int site_offset = l->num_lattice_site_var*offset;
-  int index_fw;
-  int *neighbor = s->op.neighbor_table;
-  complex_PRECISION *phi_pt;
-  config_PRECISION D_pt;
-  config_PRECISION D = s->op.D;
-  int n = l->num_lattice_site_var;
-  int D_site_offset = 4*n*n;
-  int D_link_offset = n*n;
-
-  buffer_PRECISION_define( eta1, 0, 0, n*offset, l );
-  buffer_PRECISION_define( eta2, 0, 0, n*offset, l );
-
-  // requires the positive boundaries of phi to be communicated before
-  index_fw = neighbor[5*site+1 + mu];
-  D_pt = D + D_site_offset*site + D_link_offset*mu;
-  phi_pt = phi + site_offset*index_fw;
-  coarse_spinwise_hopp_PRECISION_vectorized( eta1, eta2, phi_pt, D_pt, offset, l );
-}
-
-
-void coarse_spinwise_site_self_couplings_PRECISION_vectorized(
-    complex_PRECISION *eta1, complex_PRECISION *eta2,
-    complex_PRECISION *phi, config_PRECISION clover, int elements, level_struct *l ) {
-
-  sse_coarse_spinwise_site_self_couplings_PRECISION( eta1, eta2, phi, clover, elements, l );
-}
-
-#endif
diff --git a/src/sse_coarse_operator_generic.h b/src/sse_coarse_operator_generic.h
deleted file mode 100644
index c9a0d3b..0000000
--- a/src/sse_coarse_operator_generic.h
+++ /dev/null
@@ -1,311 +0,0 @@
-/*
- * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder.
- *
- * This file is part of the DDalphaAMG solver library.
- *
- * The DDalphaAMG solver library is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * The DDalphaAMG solver library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- *
- * You should have received a copy of the GNU General Public License
- * along with the DDalphaAMG solver library. If not, see http://www.gnu.org/licenses/.
- *
- */
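
// The column padding rule used throughout these files, as a standalone
// sketch: each column is rounded up to the next multiple of the SIMD
// width so the following column starts on an aligned boundary.  The
// integer expression is the usual round-up division; SIMD_LENGTH = 4
// (SSE, single precision) is assumed here for illustration.
#include <stdio.h>

enum { SIMD_LENGTH = 4 };

static int padded_rows( int rows ) {
  return SIMD_LENGTH * ((rows + SIMD_LENGTH - 1) / SIMD_LENGTH);
}

int main( void ) {
  int vecs = 5;                                             // e.g. 5 test vectors
  printf( "%d -> %d\n", 4*vecs, padded_rows(4*vecs) );      // 20 -> 20
  printf( "%d -> %d\n", 2*vecs+1, padded_rows(2*vecs+1) );  // 11 -> 12
  return 0;
}
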
-
-#ifndef SSE_COARSE_OPERATOR_PRECISION_HEADER
-  #define SSE_COARSE_OPERATOR_PRECISION_HEADER
-
-  #ifdef SSE
-
-  #include "blas_vectorized.h"
-
-  void coarse_operator_PRECISION_setup_vectorized( complex_PRECISION *operator, level_struct *l, struct Thread *threading );
-  void set_coarse_self_coupling_PRECISION_vectorized( complex_PRECISION *spin_0_1, complex_PRECISION *spin_2_3,
-      complex_PRECISION *V, level_struct *l, int site, const int n_rhs, complex_PRECISION *tmp );
-  void set_coarse_self_coupling_PRECISION_vectorized_finalize( level_struct *l, int site, const int n_rhs, complex_PRECISION *tmp );
-  // here we do not check whether site is really on boundary, caller is responsible for that
-  // tmp is used to store coarse operator with padding, until sum over all sites has been done
-  void set_coarse_neighbor_coupling_PRECISION_vectorized( complex_PRECISION *buffer1, complex_PRECISION *buffer2,
-      complex_PRECISION *V, const int mu, level_struct *l, int site, const int n_rhs, complex_PRECISION *tmp );
-  void set_coarse_neighbor_coupling_PRECISION_vectorized_finalize( const int mu, level_struct *l, int site, const int n_rhs, complex_PRECISION *tmp );
-  void set_coarse_block_diagonal_PRECISION_vectorized( complex_PRECISION *spin_0_1, complex_PRECISION *spin_2_3,
-      complex_PRECISION *V, level_struct *l, int site, const int n_rhs, complex_PRECISION *tmp );
-  void set_coarse_block_diagonal_PRECISION_vectorized_finalize( level_struct *l, int site, const int n_rhs, complex_PRECISION *tmp );
-
-  void copy_coarse_operator_to_vectorized_layout_PRECISION(config_PRECISION D,
-      OPERATOR_TYPE_PRECISION *D_vectorized, int num_aggregates, int num_eig_vect);
-  // fw and bw links have a symmetry that allows constructing one from another, see, e.g., coarse_hopp_PRECISION
-  // for vectorization we store the operator for both cases, the "daggered" links need this transformed layout
-  void copy_coarse_operator_to_transformed_vectorized_layout_PRECISION(config_PRECISION D,
-      OPERATOR_TYPE_PRECISION *D_vectorized, int num_aggregates, int num_eig_vect);
-  void copy_coarse_operator_clover_to_vectorized_layout_PRECISION(config_PRECISION clover,
-      OPERATOR_TYPE_PRECISION *clover_vectorized, int num_aggregates, int num_eig_vect);
-  void copy_coarse_operator_clover_to_doublet_vectorized_layout_PRECISION(config_PRECISION clover,
-      OPERATOR_TYPE_PRECISION *clover_vectorized, int num_aggregates, int num_eig_vect);
-  void add_tm_term_to_vectorized_layout_PRECISION(config_PRECISION tm_term,
-      OPERATOR_TYPE_PRECISION *clover_vectorized, int num_aggregates, int num_eig_vect);
-  void add_tm_term_to_doublet_vectorized_layout_PRECISION(config_PRECISION tm_term,
-      OPERATOR_TYPE_PRECISION *clover_vectorized, int num_aggregates, int num_eig_vect);
-  void add_epsbar_term_to_doublet_vectorized_layout_PRECISION(config_PRECISION eps_term,
-      OPERATOR_TYPE_PRECISION *clover_vectorized, int num_aggregates, int num_eig_vect);
-
-  void coarse_spinwise_site_self_couplings_PRECISION_vectorized(
-      complex_PRECISION *eta1, complex_PRECISION *eta2,
-      complex_PRECISION *phi, config_PRECISION clover, int elements, level_struct *l );
-
-  void coarse_aggregate_self_couplings_PRECISION_vectorized( complex_PRECISION *eta1, complex_PRECISION *eta2,
-      complex_PRECISION *phi, schwarz_PRECISION_struct *s, level_struct *l,
-      int site, int *direction_flags );
-
-  void coarse_aggregate_neighbor_couplings_PRECISION_vectorized( complex_PRECISION *eta1, complex_PRECISION *eta2,
-      complex_PRECISION *phi, const int mu, schwarz_PRECISION_struct *s,
level_struct *l, - int site ); - - void coarse_aggregate_block_diagonal_PRECISION_vectorized( complex_PRECISION *eta1, complex_PRECISION *eta2, - complex_PRECISION *phi, schwarz_PRECISION_struct *s, level_struct *l, - int site); - - - static inline void coarse_hopp_PRECISION_vectorized( vector_PRECISION *eta, vector_PRECISION *phi, - OPERATOR_TYPE_PRECISION *D, level_struct *l ) { -#ifdef OPTIMIZED_COARSE_NEIGHBOR_COUPLING_PRECISION - int nv = l->num_parent_eig_vect; - int lda = 2*SIMD_LENGTH_PRECISION*((nv+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - cgenmv_padded( 2*nv, D, lda, nv, (float *)phi->vector_buffer, (float *)eta->vector_buffer); -#endif - } - static inline void coarse_n_hopp_PRECISION_vectorized( vector_PRECISION *eta, vector_PRECISION *phi, - OPERATOR_TYPE_PRECISION *D, level_struct *l ) { -#ifdef OPTIMIZED_COARSE_NEIGHBOR_COUPLING_PRECISION - int nv = l->num_parent_eig_vect; - int lda = 2*SIMD_LENGTH_PRECISION*((nv+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - cgemv_padded( 2*nv, D, lda, nv, (float *)phi->vector_buffer, (float *)eta->vector_buffer); -#endif - } - - static inline void coarse_self_couplings_PRECISION_vectorized( vector_PRECISION *eta, vector_PRECISION *phi, - operator_PRECISION_struct *op, int start, int end, level_struct *l ) { -#ifdef OPTIMIZED_COARSE_SELF_COUPLING_PRECISION - int site_size = l->num_lattice_site_var; - int lda = SIMD_LENGTH_PRECISION*((site_size+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); -#ifdef HAVE_TM1p1 - OPERATOR_TYPE_PRECISION *clover = (g.n_flavours == 2) ? op->clover_doublet_vectorized:op->clover_vectorized; -#else - OPERATOR_TYPE_PRECISION *clover = op->clover_vectorized; -#endif - for(int i=start; ivector_buffer[i*site_size+j] = 0.0; - cgemv(site_size, clover+i*2*site_size*lda, lda, (float *)(phi->vector_buffer+i*site_size), (float *)(eta->vector_buffer+i*site_size)); - } -#endif - } - - static inline void coarse_spinwise_hopp_PRECISION_vectorized( complex_PRECISION *eta1, complex_PRECISION *eta2, - complex_PRECISION *phi, config_PRECISION D, int elements, level_struct *l ) { - -#ifdef SSE - int num_eig_vect = l->num_lattice_site_var/2; - int num_eig_vect2 = num_eig_vect*num_eig_vect; - complex_PRECISION *eta[2] = {eta1, eta2}; - // U_mu(x) = [ A B , U_-mu(x+muhat) = [ A* -C* - // C D ] -B* D* ] - // storage order: A, C, B, D - - __m128 D_re; - __m128 D_im; - __m128 in_re; - __m128 in_im; - __m128 out_re; - __m128 out_im; - // s refers to "spin" components 0and1 (->eta1) or 2and3 (->eta2) - for(int s=0; s<2; s++) { - // t is the row of the input matrix (in 2x2 block form) - for(int t=0; t<2; t++) { - for(int i=0; inum_lattice_site_var/2; - int num_eig_vect2 = num_eig_vect*num_eig_vect; - complex_PRECISION *eta[2] = {eta1, eta2}; - // U_mu(x) = [ A B , U_-mu(x+muhat) = [ A* -C* - // C D ] -B* D* ] - // storage order: A, C, B, D - // note: minus sign of D = self_coupling - hopping_term is added here - - __m128 D_re; - __m128 D_im; - __m128 in_re; - __m128 in_im; - __m128 out_re; - __m128 out_im; - // s refers to "spin" components 0and1 (->eta1) or 2and3 (->eta2) - for(int s=0; s<2; s++) { - // t is the row of the input matrix (in 2x2 block form) - for(int t=0; t<2; t++) { - for(int i=0; inum_lattice_site_var/2; - int num_eig_vect2 = num_eig_vect*num_eig_vect; - complex_PRECISION *eta[2] = {eta1, eta2}; - // U_mu(x) = [ A B , U_-mu(x+muhat) = [ A* -C* - // C D ] -B* D* ] - // storage order: A, C, B, D - // note: minus sign of D = self_coupling - hopping_term is added here - - __m128 D_re; - __m128 D_im; - __m128 in_re; 
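
// Scalar model (illustrative, not the library code) of the 2x2 block
// relation stated in the comment above: with U_mu(x) stored as blocks
// A, C, B, D (each m x m, row-major here by assumption), the backward
// link U_-mu(x+muhat) is applied from the same storage via the daggered
// blocks; the relative minus signs are folded into the spinwise +/-
// variants, so only the plain dagger is shown.
#include <complex.h>
typedef double complex cplx;

// eta += block^H * phi, i.e. eta[i] += conj(block[j][i]) * phi[j]
static void block_dagger_mv( cplx *eta, const cplx *block, const cplx *phi, int m ) {
  for ( int i=0; i<m; i++ )
    for ( int j=0; j<m; j++ )
      eta[i] += conj( block[j*m+i] ) * phi[j];
}

static void coarse_link_dagger_sketch( cplx *eta0, cplx *eta1, const cplx *U,
                                       const cplx *phi0, const cplx *phi1, int m ) {
  const cplx *A = U, *C = U + m*m, *B = U + 2*m*m, *D = U + 3*m*m;
  block_dagger_mv( eta0, A, phi0, m );  // (U^H phi)_0 = A^H phi0 + C^H phi1
  block_dagger_mv( eta0, C, phi1, m );
  block_dagger_mv( eta1, B, phi0, m );  // (U^H phi)_1 = B^H phi0 + D^H phi1
  block_dagger_mv( eta1, D, phi1, m );
}
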
- __m128 in_im; - __m128 out_re; - __m128 out_im; - // A* - for(int i=0; i1?((k)*3+6):((k)*3)) -#define index_d_re(phi,mu,spin) (gamma_re_sign[mu][spin]) * (phi)[ 6*gamma_co[mu][spin] + 12*(gamma_co[mu][spin]/2) + gamma_offset[mu][spin] ] -#define index_d_im(phi,mu,spin) (gamma_im_sign[mu][spin]) * (phi)[ 6*gamma_co[mu][spin] + 12*(gamma_co[mu][spin]/2) - gamma_offset[mu][spin] +1 ] - -#define neighbor_coupling_file "sse_dirac_su3local.h" - -void prp_double( complex_double *prn[4], complex_double *phi, int start, int end ) { - - double *phi_pt = (double*)(phi+start); - double *phi_end = (double*)(phi+end); - double *pr[4] = {(double*)(prn[0]+start/2),(double*)(prn[1]+start/2),(double*)(prn[2]+start/2),(double*)(prn[3]+start/2)}; - - while ( phi_pt < phi_end ) { - - __m128d phi_pt1_re; __m128d phi_pt1_im; - - sse_complex_deinterleaved_load_pd( phi_pt, &phi_pt1_re, &phi_pt1_im ); - for ( int mu=0; mu<4; mu++) { - __m128d phi_pt2_re = _mm_setr_pd( index_re(phi_pt,mu,0), index_re(phi_pt+2,mu,0) ); - __m128d phi_pt2_im = _mm_setr_pd( index_im(phi_pt,mu,0), index_im(phi_pt+2,mu,0) ); - __m128d res_re = _mm_sub_pd( phi_pt1_re, phi_pt2_re ); - __m128d res_im = _mm_sub_pd( phi_pt1_im, phi_pt2_im ); - sse_complex_interleaved_store_pd( res_re, res_im, pr[mu] ); - pr[mu] += 2*SIMD_LENGTH_double; - } - - sse_complex_deinterleaved_load_pd( phi_pt+4, &phi_pt1_re, &phi_pt1_im ); - for ( int mu=0; mu<4; mu++) { - __m128d phi_pt2_re = _mm_setr_pd( index_re(phi_pt+4,mu,0), index_re(phi_pt,mu,1) ); - __m128d phi_pt2_im = _mm_setr_pd( index_im(phi_pt+4,mu,0), index_im(phi_pt,mu,1) ); - __m128d res_re = _mm_sub_pd( phi_pt1_re, phi_pt2_re ); - __m128d res_im = _mm_sub_pd( phi_pt1_im, phi_pt2_im ); - sse_complex_interleaved_store_pd( res_re, res_im, pr[mu] ); - pr[mu] += 2*SIMD_LENGTH_double; - } - - sse_complex_deinterleaved_load_pd( phi_pt+8, &phi_pt1_re, &phi_pt1_im ); - for ( int mu=0; mu<4; mu++) { - __m128d phi_pt2_re = _mm_setr_pd( index_re(phi_pt+2,mu,1), index_re(phi_pt+4,mu,1) ); - __m128d phi_pt2_im = _mm_setr_pd( index_im(phi_pt+2,mu,1), index_im(phi_pt+4,mu,1) ); - __m128d res_re = _mm_sub_pd( phi_pt1_re, phi_pt2_re ); - __m128d res_im = _mm_sub_pd( phi_pt1_im, phi_pt2_im ); - sse_complex_interleaved_store_pd( res_re, res_im, pr[mu] ); - pr[mu] += 2*SIMD_LENGTH_double; - } - - phi_pt += 24; - } -} - - -void prp_float( complex_float *prn[4], complex_float *phi, int start, int end ) { - - float *phi_pt = (float*)(phi+start); - float *phi_end = (float*)(phi+end); - float *pr[4] = {(float*)(prn[0]+start/2),(float*)(prn[1]+start/2),(float*)(prn[2]+start/2),(float*)(prn[3]+start/2)}; - - while ( phi_pt < phi_end ) { - - __m128 phi_pt1_re = _mm_setr_ps( phi_pt[0], phi_pt[2], phi_pt[4], phi_pt[6] ); - __m128 phi_pt1_im = _mm_setr_ps( phi_pt[1], phi_pt[3], phi_pt[5], phi_pt[7] ); - for ( int mu=0; mu<4; mu++) { - __m128 phi_pt2_re = _mm_setr_ps( index_re(phi_pt,mu,0), index_re(phi_pt+2,mu,0), - index_re(phi_pt+4,mu,0), index_re(phi_pt,mu,1) ); - __m128 phi_pt2_im = _mm_setr_ps( index_im(phi_pt,mu,0), index_im(phi_pt+2,mu,0), - index_im(phi_pt+4,mu,0), index_im(phi_pt,mu,1) ); - - __m128 res_re = _mm_sub_ps( phi_pt1_re, phi_pt2_re ); - __m128 res_im = _mm_sub_ps( phi_pt1_im, phi_pt2_im ); - - sse_complex_interleaved_store( res_re, res_im, pr[mu] ); - pr[mu] += 8; - } - - phi_pt1_re = _mm_setr_ps( phi_pt[8], phi_pt[10], phi_pt[24], phi_pt[26] ); - phi_pt1_im = _mm_setr_ps( phi_pt[9], phi_pt[11], phi_pt[25], phi_pt[27] ); - for ( int mu=0; mu<4; mu++) { - __m128 phi_pt2_re = _mm_setr_ps( index_re(phi_pt+2,mu,1), 
index_re(phi_pt+4,mu,1), - index_re(phi_pt+24,mu,0), index_re(phi_pt+26,mu,0) ); - __m128 phi_pt2_im = _mm_setr_ps( index_im(phi_pt+2,mu,1), index_im(phi_pt+4,mu,1), - index_im(phi_pt+24,mu,0), index_im(phi_pt+26,mu,0) ); - __m128 res_re = _mm_sub_ps( phi_pt1_re, phi_pt2_re ); - __m128 res_im = _mm_sub_ps( phi_pt1_im, phi_pt2_im ); - - sse_complex_interleaved_store( res_re, res_im, pr[mu] ); - pr[mu] += 8; - } - - phi_pt1_re = _mm_setr_ps( phi_pt[28], phi_pt[30], phi_pt[32], phi_pt[34] ); - phi_pt1_im = _mm_setr_ps( phi_pt[29], phi_pt[31], phi_pt[33], phi_pt[35] ); - for ( int mu=0; mu<4; mu++) { - __m128 phi_pt2_re = _mm_setr_ps( index_re(phi_pt+28,mu,0), index_re(phi_pt+24,mu,1), - index_re(phi_pt+26,mu,1), index_re(phi_pt+28,mu,1) ); - __m128 phi_pt2_im = _mm_setr_ps( index_im(phi_pt+28,mu,0), index_im(phi_pt+24,mu,1), - index_im(phi_pt+26,mu,1), index_im(phi_pt+28,mu,1) ); - __m128 res_re = _mm_sub_ps( phi_pt1_re, phi_pt2_re ); - __m128 res_im = _mm_sub_ps( phi_pt1_im, phi_pt2_im ); - - sse_complex_interleaved_store( res_re, res_im, pr[mu] ); - pr[mu] += 8; - } - - phi_pt+=48; - } -} - - -void dprp_double( complex_double *prn[4], complex_double *phi, int start, int end ) { - - double *phi_pt = (double*)(phi+start); - double *phi_end = (double*)(phi+end); - double *pr[4] = {(double*)(prn[0]+start/2),(double*)(prn[1]+start/2),(double*)(prn[2]+start/2),(double*)(prn[3]+start/2)}; - - while ( phi_pt < phi_end ) { - - __m128d phi_pt1_re; __m128d phi_pt1_im; - - sse_complex_deinterleaved_load_pd( phi_pt, &phi_pt1_re, &phi_pt1_im ); - for ( int mu=0; mu<4; mu++) { - __m128d phi_pt2_re = _mm_setr_pd( index_d_re(phi_pt,mu,0), index_d_re(phi_pt+2,mu,0) ); - __m128d phi_pt2_im = _mm_setr_pd( index_d_im(phi_pt,mu,0), index_d_im(phi_pt+2,mu,0) ); - __m128d res_re = _mm_sub_pd( phi_pt1_re, phi_pt2_re ); - __m128d res_im = _mm_sub_pd( phi_pt1_im, phi_pt2_im ); - sse_complex_interleaved_store_pd( res_re, res_im, pr[mu] ); - pr[mu] += 2*SIMD_LENGTH_double; - } - - sse_complex_deinterleaved_load_pd( phi_pt+4, &phi_pt1_re, &phi_pt1_im ); - for ( int mu=0; mu<4; mu++) { - __m128d phi_pt2_re = _mm_setr_pd( index_d_re(phi_pt+4,mu,0), index_d_re(phi_pt,mu,1) ); - __m128d phi_pt2_im = _mm_setr_pd( index_d_im(phi_pt+4,mu,0), index_d_im(phi_pt,mu,1) ); - __m128d res_re = _mm_sub_pd( phi_pt1_re, phi_pt2_re ); - __m128d res_im = _mm_sub_pd( phi_pt1_im, phi_pt2_im ); - sse_complex_interleaved_store_pd( res_re, res_im, pr[mu] ); - pr[mu] += 2*SIMD_LENGTH_double; - } - - sse_complex_deinterleaved_load_pd( phi_pt+8, &phi_pt1_re, &phi_pt1_im ); - for ( int mu=0; mu<4; mu++) { - __m128d phi_pt2_re = _mm_setr_pd( index_d_re(phi_pt+2,mu,1), index_d_re(phi_pt+4,mu,1) ); - __m128d phi_pt2_im = _mm_setr_pd( index_d_im(phi_pt+2,mu,1), index_d_im(phi_pt+4,mu,1) ); - __m128d res_re = _mm_sub_pd( phi_pt1_re, phi_pt2_re ); - __m128d res_im = _mm_sub_pd( phi_pt1_im, phi_pt2_im ); - sse_complex_interleaved_store_pd( res_re, res_im, pr[mu] ); - pr[mu] += 2*SIMD_LENGTH_double; - } - - sse_complex_deinterleaved_load_pd( phi_pt+12, &phi_pt1_re, &phi_pt1_im ); - for ( int mu=0; mu<4; mu++) { - __m128d phi_pt2_re = _mm_setr_pd( index_d_re(phi_pt+12,mu,0), index_d_re(phi_pt+14,mu,0) ); - __m128d phi_pt2_im = _mm_setr_pd( index_d_im(phi_pt+12,mu,0), index_d_im(phi_pt+14,mu,0) ); - __m128d res_re = _mm_sub_pd( phi_pt1_re, phi_pt2_re ); - __m128d res_im = _mm_sub_pd( phi_pt1_im, phi_pt2_im ); - sse_complex_interleaved_store_pd( res_re, res_im, pr[mu] ); - pr[mu] += 2*SIMD_LENGTH_double; - } - - sse_complex_deinterleaved_load_pd( phi_pt+16, 
&phi_pt1_re, &phi_pt1_im ); - for ( int mu=0; mu<4; mu++) { - __m128d phi_pt2_re = _mm_setr_pd( index_d_re(phi_pt+16,mu,0), index_d_re(phi_pt+12,mu,1) ); - __m128d phi_pt2_im = _mm_setr_pd( index_d_im(phi_pt+16,mu,0), index_d_im(phi_pt+12,mu,1) ); - __m128d res_re = _mm_sub_pd( phi_pt1_re, phi_pt2_re ); - __m128d res_im = _mm_sub_pd( phi_pt1_im, phi_pt2_im ); - sse_complex_interleaved_store_pd( res_re, res_im, pr[mu] ); - pr[mu] += 2*SIMD_LENGTH_double; - } - - sse_complex_deinterleaved_load_pd( phi_pt+20, &phi_pt1_re, &phi_pt1_im ); - for ( int mu=0; mu<4; mu++) { - __m128d phi_pt2_re = _mm_setr_pd( index_d_re(phi_pt+14,mu,1), index_d_re(phi_pt+16,mu,1) ); - __m128d phi_pt2_im = _mm_setr_pd( index_d_im(phi_pt+14,mu,1), index_d_im(phi_pt+16,mu,1) ); - __m128d res_re = _mm_sub_pd( phi_pt1_re, phi_pt2_re ); - __m128d res_im = _mm_sub_pd( phi_pt1_im, phi_pt2_im ); - sse_complex_interleaved_store_pd( res_re, res_im, pr[mu] ); - pr[mu] += 2*SIMD_LENGTH_double; - } - - phi_pt += 48; - } -} - - -void dprp_float( complex_float *prn[4], complex_float *phi, int start, int end ) { - - float *phi_pt = (float*)(phi+start); - float *phi_end = (float*)(phi+end); - float *pr[4] = {(float*)(prn[0]+start/2),(float*)(prn[1]+start/2),(float*)(prn[2]+start/2),(float*)(prn[3]+start/2)}; - - while ( phi_pt < phi_end ) { - - __m128 phi_pt1_re = _mm_setr_ps( phi_pt[0], phi_pt[2], phi_pt[4], phi_pt[6] ); - __m128 phi_pt1_im = _mm_setr_ps( phi_pt[1], phi_pt[3], phi_pt[5], phi_pt[7] ); - for ( int mu=0; mu<4; mu++) { - __m128 phi_pt2_re = _mm_setr_ps( index_d_re(phi_pt,mu,0), index_d_re(phi_pt+2,mu,0), - index_d_re(phi_pt+4,mu,0), index_d_re(phi_pt,mu,1) ); - __m128 phi_pt2_im = _mm_setr_ps( index_d_im(phi_pt,mu,0), index_d_im(phi_pt+2,mu,0), - index_d_im(phi_pt+4,mu,0), index_d_im(phi_pt,mu,1) ); - - __m128 res_re = _mm_sub_ps( phi_pt1_re, phi_pt2_re ); - __m128 res_im = _mm_sub_ps( phi_pt1_im, phi_pt2_im ); - - sse_complex_interleaved_store( res_re, res_im, pr[mu] ); - pr[mu] += 8; - } - - phi_pt1_re = _mm_setr_ps( phi_pt[8], phi_pt[10], phi_pt[12], phi_pt[14] ); - phi_pt1_im = _mm_setr_ps( phi_pt[9], phi_pt[11], phi_pt[13], phi_pt[15] ); - for ( int mu=0; mu<4; mu++) { - __m128 phi_pt2_re = _mm_setr_ps( index_d_re(phi_pt+2,mu,1), index_d_re(phi_pt+4,mu,1), - index_d_re(phi_pt+12,mu,0), index_d_re(phi_pt+14,mu,0) ); - __m128 phi_pt2_im = _mm_setr_ps( index_d_im(phi_pt+2,mu,1), index_d_im(phi_pt+4,mu,1), - index_d_im(phi_pt+12,mu,0), index_d_im(phi_pt+14,mu,0) ); - __m128 res_re = _mm_sub_ps( phi_pt1_re, phi_pt2_re ); - __m128 res_im = _mm_sub_ps( phi_pt1_im, phi_pt2_im ); - - sse_complex_interleaved_store( res_re, res_im, pr[mu] ); - pr[mu] += 8; - } - - phi_pt1_re = _mm_setr_ps( phi_pt[16], phi_pt[18], phi_pt[20], phi_pt[22] ); - phi_pt1_im = _mm_setr_ps( phi_pt[17], phi_pt[19], phi_pt[21], phi_pt[23] ); - for ( int mu=0; mu<4; mu++) { - __m128 phi_pt2_re = _mm_setr_ps( index_d_re(phi_pt+16,mu,0), index_d_re(phi_pt+12,mu,1), - index_d_re(phi_pt+14,mu,1), index_d_re(phi_pt+16,mu,1) ); - __m128 phi_pt2_im = _mm_setr_ps( index_d_im(phi_pt+16,mu,0), index_d_im(phi_pt+12,mu,1), - index_d_im(phi_pt+14,mu,1), index_d_im(phi_pt+16,mu,1) ); - __m128 res_re = _mm_sub_ps( phi_pt1_re, phi_pt2_re ); - __m128 res_im = _mm_sub_ps( phi_pt1_im, phi_pt2_im ); - - sse_complex_interleaved_store( res_re, res_im, pr[mu] ); - pr[mu] += 8; - } - - phi_pt+=48; - } -} - - -void prn_su3_double( complex_double *prp[4], complex_double *phi, operator_double_struct *op, int *neighbor, int start, int end ) { - - double *phi_pt = 
(double*)(phi+start); - double *phi_end_pt = (double*)(phi+end); - double *pr[4] = {(double*)(prp[0]),(double*)(prp[1]),(double*)(prp[2]),(double*)(prp[3])}; - double *D_pt = ((double*)(op->D))+2*(start*3); - int *nb_pt = neighbor+((start/12)*4); - - while ( phi_pt < phi_end_pt ) { - - __m128d in_re[3]; - __m128d in_im[3]; - - for ( int i=0; i<3; i++ ) { - in_re[i] = _mm_setr_pd( phi_pt[2*i+0], phi_pt[2*i+6] ); - in_im[i] = _mm_setr_pd( phi_pt[2*i+1], phi_pt[2*i+7] ); - } - - for ( int mu=0; mu<4; mu++ ) { - - __m128d v_re[3]; - __m128d v_im[3]; - - // calc spin projection - for ( int i=0; i<3; i++ ) { - v_re[i] = _mm_setr_pd( index_re(phi_pt+2*i,mu,0), index_re(phi_pt+2*i,mu,1) ); - v_im[i] = _mm_setr_pd( index_im(phi_pt+2*i,mu,0), index_im(phi_pt+2*i,mu,1) ); - v_re[i] = _mm_add_pd( in_re[i], v_re[i] ); - v_im[i] = _mm_add_pd( in_im[i], v_im[i] ); - } - - { - __m128d res_re[3]; - __m128d res_im[3]; - // load su(3) matrix and multiply - for ( int i=0; i<3; i++ ) { - __m128d buf_re = _mm_set1_pd( D_pt[0+2*i] ); - __m128d buf_im = _mm_set1_pd( D_pt[1+2*i] ); - cmul_conj_pd( buf_re, buf_im, v_re[0], v_im[0], &res_re[i], &res_im[i] ); - buf_re = _mm_set1_pd( D_pt[6+2*i] ); - buf_im = _mm_set1_pd( D_pt[7+2*i] ); - cfmadd_conj_pd( buf_re, buf_im, v_re[1], v_im[1], &res_re[i], &res_im[i] ); - buf_re = _mm_set1_pd( D_pt[12+2*i] ); - buf_im = _mm_set1_pd( D_pt[13+2*i] ); - cfmadd_conj_pd( buf_re, buf_im, v_re[2], v_im[2], &res_re[i], &res_im[i] ); - } - - { - double *pr_pt = pr[mu]+2*6*(*(nb_pt)); - for ( int i=0; i<3; i++ ) { - __m128d out1 = _mm_unpacklo_pd( res_re[i], res_im[i] ); - __m128d out2 = _mm_unpackhi_pd( res_re[i], res_im[i] ); - _mm_storeu_pd( pr_pt+0+2*i, out1 ); - _mm_storeu_pd( pr_pt+6+2*i, out2 ); - } - } - } - - D_pt += 18; - nb_pt++; - } - - phi_pt += 12*2; - } - -} - - -void prn_su3_float( complex_float *prp[4], complex_float *phi, operator_float_struct *op, int *neighbor, int start, int end ) { - - float *phi_pt = (float*)(phi+start); - float *phi_end_pt = (float*)(phi+end); - float *pr[4] = {(float*)(prp[0]),(float*)(prp[1]),(float*)(prp[2]),(float*)(prp[3])}; - float *D_pt = (float*)(op->D_transformed_vectorized+2*(start*4)); - int *nb_pt = neighbor+((start/12)*4); - - while ( phi_pt < phi_end_pt ) { - - __m128 in1[2]; - __m128 in2[2]; - - in1[0] = _mm_setr_ps( phi_pt[0], phi_pt[2], phi_pt[4], 0 ); - in1[1] = _mm_setr_ps( phi_pt[1], phi_pt[3], phi_pt[5], 0 ); - in2[0] = _mm_setr_ps( phi_pt[6], phi_pt[8], phi_pt[10], 0 ); - in2[1] = _mm_setr_ps( phi_pt[7], phi_pt[9], phi_pt[11], 0 ); - - for ( int mu=0; mu<4; mu++ ) { - __m128 res1[2]; - __m128 res2[2]; - - { - // calc spin0 projection - res1[0] = _mm_setr_ps( index_re(phi_pt,mu,0), index_re(phi_pt+2,mu,0), index_re(phi_pt+4,mu,0), 0 ); - res1[1] = _mm_setr_ps( index_im(phi_pt,mu,0), index_im(phi_pt+2,mu,0), index_im(phi_pt+4,mu,0), 0 ); - __m128 in1_re = _mm_add_ps( in1[0], res1[0] ); - __m128 in1_im = _mm_add_ps( in1[1], res1[1] ); - - // calc spin1 projection - res1[0] = _mm_setr_ps( index_re(phi_pt,mu,1), index_re(phi_pt+2,mu,1), index_re(phi_pt+4,mu,1), 0 ); - res1[1] = _mm_setr_ps( index_im(phi_pt,mu,1), index_im(phi_pt+2,mu,1), index_im(phi_pt+4,mu,1), 0 ); - __m128 in2_re = _mm_add_ps( in2[0], res1[0] ); - __m128 in2_im = _mm_add_ps( in2[1], res1[1] ); - - // load 1st part of su(3) matrix and multiply - { - __m128 buf1 = _mm_loadu_ps( D_pt ); - __m128 buf2 = _mm_loadu_ps( D_pt+SIMD_LENGTH_float ); - { - __m128 buf3 = _mm_shuffle_ps( in1_re, in1_re, _MM_SHUFFLE(0,0,0,0) ); - __m128 buf4 = _mm_shuffle_ps( in1_im, 
in1_im, _MM_SHUFFLE(0,0,0,0) ); - cmul_conj( buf1, buf2, buf3, buf4, &res1[0], &res1[1] ); - } - { - __m128 buf3 = _mm_shuffle_ps( in2_re, in2_re, _MM_SHUFFLE(0,0,0,0) ); - __m128 buf4 = _mm_shuffle_ps( in2_im, in2_im, _MM_SHUFFLE(0,0,0,0) ); - cmul_conj( buf1, buf2, buf3, buf4, &res2[0], &res2[1] ); - } - } - // load 2nd part of su(3) matrix and multiply - { - __m128 buf1 = _mm_loadu_ps( D_pt+2*SIMD_LENGTH_float ); - __m128 buf2 = _mm_loadu_ps( D_pt+3*SIMD_LENGTH_float ); - { - __m128 buf3 = _mm_shuffle_ps( in1_re, in1_re, _MM_SHUFFLE(1,1,1,1) ); - __m128 buf4 = _mm_shuffle_ps( in1_im, in1_im, _MM_SHUFFLE(1,1,1,1) ); - cfmadd_conj( buf1, buf2, buf3, buf4, &res1[0], &res1[1] ); - } - { - __m128 buf3 = _mm_shuffle_ps( in2_re, in2_re, _MM_SHUFFLE(1,1,1,1) ); - __m128 buf4 = _mm_shuffle_ps( in2_im, in2_im, _MM_SHUFFLE(1,1,1,1) ); - cfmadd_conj( buf1, buf2, buf3, buf4, &res2[0], &res2[1] ); - } - } - // load 3rd part of su(3) matrix and multiply - { - __m128 buf1 = _mm_loadu_ps( D_pt+4*SIMD_LENGTH_float ); - __m128 buf2 = _mm_loadu_ps( D_pt+5*SIMD_LENGTH_float ); - { - __m128 buf3 = _mm_shuffle_ps( in1_re, in1_re, _MM_SHUFFLE(2,2,2,2) ); - __m128 buf4 = _mm_shuffle_ps( in1_im, in1_im, _MM_SHUFFLE(2,2,2,2) ); - cfmadd_conj( buf1, buf2, buf3, buf4, &res1[0], &res1[1] ); - } - { - __m128 buf3 = _mm_shuffle_ps( in2_re, in2_re, _MM_SHUFFLE(2,2,2,2) ); - __m128 buf4 = _mm_shuffle_ps( in2_im, in2_im, _MM_SHUFFLE(2,2,2,2) ); - cfmadd_conj( buf1, buf2, buf3, buf4, &res2[0], &res2[1] ); - } - } - } - - { - __m128 buf1 = _mm_unpacklo_ps( res1[0], res1[1] ); - __m128 buf2 = _mm_unpackhi_ps( res1[0], res1[1] ); - __m128 buf3 = _mm_unpacklo_ps( res2[0], res2[1] ); - - { - __m128 buf4 = _mm_unpackhi_ps( res2[0], res2[1] ); - buf2 = _mm_movelh_ps( buf2, buf3 ); - buf3 = _mm_shuffle_ps( buf3, buf4, _MM_SHUFFLE(1,0,3,2) ); // hi(buf3), lo(buf4) - } - { - float *pr_pt = pr[mu]+2*6*(*nb_pt); - _mm_storeu_ps( pr_pt, buf1 ); - _mm_storeu_ps( pr_pt+4, buf2 ); - _mm_storeu_ps( pr_pt+8, buf3 ); - } - } - nb_pt++; - D_pt += 24; - } - - phi_pt += 24; - } -} - - -void dprn_su3_double( complex_double *prp[4], complex_double *phi, operator_double_struct *op, int *neighbor, int start, int end ) { - - double *phi_pt = (double*)(phi+start); - double *phi_end_pt = (double*)(phi+end); - double *pr[4] = {(double*)(prp[0]),(double*)(prp[1]),(double*)(prp[2]),(double*)(prp[3])}; - double *D_pt = ((double*)(op->D))+2*(start/24*36); - int *nb_pt = neighbor+((start/24)*4); - - while ( phi_pt < phi_end_pt ) { - - __m128d in_re[6]; - __m128d in_im[6]; - - for ( int i=0; i<3; i++ ) { - in_re[i] = _mm_setr_pd( phi_pt[2*i+0], phi_pt[2*i+6] ); - in_im[i] = _mm_setr_pd( phi_pt[2*i+1], phi_pt[2*i+7] ); - } - for ( int i=3; i<6; i++ ) { - in_re[i] = _mm_setr_pd( phi_pt[2*i+6], phi_pt[2*i+12] ); - in_im[i] = _mm_setr_pd( phi_pt[2*i+7], phi_pt[2*i+13] ); - } - - for ( int mu=0; mu<4; mu++ ) { - - __m128d v_re[6]; - __m128d v_im[6]; - - // calc spin projection - for ( int i=0; i<3; i++ ) { - v_re[i] = _mm_setr_pd( index_d_re(phi_pt+2*i,mu,0), index_d_re(phi_pt+2*i,mu,1) ); - v_im[i] = _mm_setr_pd( index_d_im(phi_pt+2*i,mu,0), index_d_im(phi_pt+2*i,mu,1) ); - v_re[i] = _mm_add_pd( in_re[i], v_re[i] ); - v_im[i] = _mm_add_pd( in_im[i], v_im[i] ); - } - for ( int i=3; i<6; i++ ) { - v_re[i] = _mm_setr_pd( index_d_re(phi_pt+6+2*i,mu,0), index_d_re(phi_pt+6+2*i,mu,1) ); - v_im[i] = _mm_setr_pd( index_d_im(phi_pt+6+2*i,mu,0), index_d_im(phi_pt+6+2*i,mu,1) ); - v_re[i] = _mm_add_pd( in_re[i], v_re[i] ); - v_im[i] = _mm_add_pd( in_im[i], v_im[i] ); - } 
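
// The SSE helpers used above keep real and imaginary parts in separate
// registers; a scalar model of their semantics (the naming convention,
// with the first operand conjugated, is an assumption inferred from
// their use in the daggered link multiplications here):
//   cmul:        res  = a*b          cfmadd:      res += a*b
//   cmul_conj:   res  = conj(a)*b    cfmadd_conj: res += conj(a)*b
static inline void cfmadd_conj_scalar( double a_re, double a_im,
                                       double b_re, double b_im,
                                       double *res_re, double *res_im ) {
  // conj(a)*b = (a_re - i*a_im)*(b_re + i*b_im)
  *res_re += a_re*b_re + a_im*b_im;
  *res_im += a_re*b_im - a_im*b_re;
}

static inline void cfmadd_scalar( double a_re, double a_im,
                                  double b_re, double b_im,
                                  double *res_re, double *res_im ) {
  *res_re += a_re*b_re - a_im*b_im;
  *res_im += a_re*b_im + a_im*b_re;
}
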
- - { - __m128d res_re[6]; - __m128d res_im[6]; - // load su(3) matrix and multiply - for ( int i=0; i<3; i++ ) { - __m128d buf_re = _mm_set1_pd( D_pt[0+2*i] ); - __m128d buf_im = _mm_set1_pd( D_pt[1+2*i] ); - cmul_conj_pd( buf_re, buf_im, v_re[0], v_im[0], &res_re[i], &res_im[i] ); - cmul_conj_pd( buf_re, buf_im, v_re[3], v_im[3], &res_re[i+3], &res_im[i+3] ); - buf_re = _mm_set1_pd( D_pt[6+2*i] ); - buf_im = _mm_set1_pd( D_pt[7+2*i] ); - cfmadd_conj_pd( buf_re, buf_im, v_re[1], v_im[1], &res_re[i], &res_im[i] ); - cfmadd_conj_pd( buf_re, buf_im, v_re[4], v_im[4], &res_re[i+3], &res_im[i+3] ); - buf_re = _mm_set1_pd( D_pt[12+2*i] ); - buf_im = _mm_set1_pd( D_pt[13+2*i] ); - cfmadd_conj_pd( buf_re, buf_im, v_re[2], v_im[2], &res_re[i], &res_im[i] ); - cfmadd_conj_pd( buf_re, buf_im, v_re[5], v_im[5], &res_re[i+3], &res_im[i+3] ); - } - - { - double *pr_pt = pr[mu]+2*12*(*(nb_pt)); - for ( int i=0; i<3; i++ ) { - __m128d out1 = _mm_unpacklo_pd( res_re[i], res_im[i] ); - __m128d out2 = _mm_unpackhi_pd( res_re[i], res_im[i] ); - _mm_storeu_pd( pr_pt+0+2*i, out1 ); - _mm_storeu_pd( pr_pt+6+2*i, out2 ); - } - for ( int i=3; i<6; i++ ) { - __m128d out1 = _mm_unpacklo_pd( res_re[i], res_im[i] ); - __m128d out2 = _mm_unpackhi_pd( res_re[i], res_im[i] ); - _mm_storeu_pd( pr_pt+ 6+2*i, out1 ); - _mm_storeu_pd( pr_pt+12+2*i, out2 ); - } - } - } - - D_pt += 18; - nb_pt++; - } - - phi_pt += 24*2; - } - -} - - -void dprn_su3_float( complex_float *prp[4], complex_float *phi, operator_float_struct *op, int *neighbor, int start, int end ) { - - float *phi_pt = (float*)(phi+start); - float *phi_end_pt = (float*)(phi+end); - float *pr[4] = {(float*)(prp[0]),(float*)(prp[1]),(float*)(prp[2]),(float*)(prp[3])}; - float *D_pt = (float*)(op->D_transformed_vectorized+2*(start/24*48)); - int *nb_pt = neighbor+((start/24)*4); - - while ( phi_pt < phi_end_pt ) { - - __m128 in11[2]; - __m128 in21[2]; - __m128 in12[2]; - __m128 in22[2]; - - in11[0] = _mm_setr_ps( phi_pt[0], phi_pt[2], phi_pt[4], 0 ); - in11[1] = _mm_setr_ps( phi_pt[1], phi_pt[3], phi_pt[5], 0 ); - in21[0] = _mm_setr_ps( phi_pt[6], phi_pt[8], phi_pt[10], 0 ); - in21[1] = _mm_setr_ps( phi_pt[7], phi_pt[9], phi_pt[11], 0 ); - in12[0] = _mm_setr_ps( phi_pt[12], phi_pt[14], phi_pt[16], 0 ); - in12[1] = _mm_setr_ps( phi_pt[13], phi_pt[15], phi_pt[17], 0 ); - in22[0] = _mm_setr_ps( phi_pt[18], phi_pt[20], phi_pt[22], 0 ); - in22[1] = _mm_setr_ps( phi_pt[19], phi_pt[21], phi_pt[23], 0 ); - - for ( int mu=0; mu<4; mu++ ) { - __m128 res11[2]; - __m128 res21[2]; - __m128 res12[2]; - __m128 res22[2]; - - { - // calc spin0 projection - res11[0] = _mm_setr_ps( index_d_re(phi_pt,mu,0), index_d_re(phi_pt+2,mu,0), index_d_re(phi_pt+4,mu,0), 0 ); - res11[1] = _mm_setr_ps( index_d_im(phi_pt,mu,0), index_d_im(phi_pt+2,mu,0), index_d_im(phi_pt+4,mu,0), 0 ); - __m128 in11_re = _mm_add_ps( in11[0], res11[0] ); - __m128 in11_im = _mm_add_ps( in11[1], res11[1] ); - - // calc spin1 projection - res11[0] = _mm_setr_ps( index_d_re(phi_pt,mu,1), index_d_re(phi_pt+2,mu,1), index_d_re(phi_pt+4,mu,1), 0 ); - res11[1] = _mm_setr_ps( index_d_im(phi_pt,mu,1), index_d_im(phi_pt+2,mu,1), index_d_im(phi_pt+4,mu,1), 0 ); - __m128 in21_re = _mm_add_ps( in21[0], res11[0] ); - __m128 in21_im = _mm_add_ps( in21[1], res11[1] ); - - // calc spin0 projection - res12[0] = _mm_setr_ps( index_d_re(phi_pt+12,mu,0), index_d_re(phi_pt+14,mu,0), index_d_re(phi_pt+16,mu,0), 0 ); - res12[1] = _mm_setr_ps( index_d_im(phi_pt+12,mu,0), index_d_im(phi_pt+14,mu,0), index_d_im(phi_pt+16,mu,0), 0 ); - __m128 
in12_re = _mm_add_ps( in12[0], res12[0] ); - __m128 in12_im = _mm_add_ps( in12[1], res12[1] ); - - // calc spin1 projection - res12[0] = _mm_setr_ps( index_d_re(phi_pt+12,mu,1), index_d_re(phi_pt+14,mu,1), index_d_re(phi_pt+16,mu,1), 0 ); - res12[1] = _mm_setr_ps( index_d_im(phi_pt+12,mu,1), index_d_im(phi_pt+14,mu,1), index_d_im(phi_pt+16,mu,1), 0 ); - __m128 in22_re = _mm_add_ps( in22[0], res12[0] ); - __m128 in22_im = _mm_add_ps( in22[1], res12[1] ); - - // load 1st part of su(3) matrix and multiply - { - __m128 buf1 = _mm_loadu_ps( D_pt ); - __m128 buf2 = _mm_loadu_ps( D_pt+SIMD_LENGTH_float ); - { - __m128 buf3 = _mm_shuffle_ps( in11_re, in11_re, _MM_SHUFFLE(0,0,0,0) ); - __m128 buf4 = _mm_shuffle_ps( in11_im, in11_im, _MM_SHUFFLE(0,0,0,0) ); - cmul_conj( buf1, buf2, buf3, buf4, &res11[0], &res11[1] ); - } - { - __m128 buf3 = _mm_shuffle_ps( in21_re, in21_re, _MM_SHUFFLE(0,0,0,0) ); - __m128 buf4 = _mm_shuffle_ps( in21_im, in21_im, _MM_SHUFFLE(0,0,0,0) ); - cmul_conj( buf1, buf2, buf3, buf4, &res21[0], &res21[1] ); - } - { - __m128 buf3 = _mm_shuffle_ps( in12_re, in12_re, _MM_SHUFFLE(0,0,0,0) ); - __m128 buf4 = _mm_shuffle_ps( in12_im, in12_im, _MM_SHUFFLE(0,0,0,0) ); - cmul_conj( buf1, buf2, buf3, buf4, &res12[0], &res12[1] ); - } - { - __m128 buf3 = _mm_shuffle_ps( in22_re, in22_re, _MM_SHUFFLE(0,0,0,0) ); - __m128 buf4 = _mm_shuffle_ps( in22_im, in22_im, _MM_SHUFFLE(0,0,0,0) ); - cmul_conj( buf1, buf2, buf3, buf4, &res22[0], &res22[1] ); - } - } - // load 2nd part of su(3) matrix and multiply - { - __m128 buf1 = _mm_loadu_ps( D_pt+2*SIMD_LENGTH_float ); - __m128 buf2 = _mm_loadu_ps( D_pt+3*SIMD_LENGTH_float ); - { - __m128 buf3 = _mm_shuffle_ps( in11_re, in11_re, _MM_SHUFFLE(1,1,1,1) ); - __m128 buf4 = _mm_shuffle_ps( in11_im, in11_im, _MM_SHUFFLE(1,1,1,1) ); - cfmadd_conj( buf1, buf2, buf3, buf4, &res11[0], &res11[1] ); - } - { - __m128 buf3 = _mm_shuffle_ps( in21_re, in21_re, _MM_SHUFFLE(1,1,1,1) ); - __m128 buf4 = _mm_shuffle_ps( in21_im, in21_im, _MM_SHUFFLE(1,1,1,1) ); - cfmadd_conj( buf1, buf2, buf3, buf4, &res21[0], &res21[1] ); - } - { - __m128 buf3 = _mm_shuffle_ps( in12_re, in12_re, _MM_SHUFFLE(1,1,1,1) ); - __m128 buf4 = _mm_shuffle_ps( in12_im, in12_im, _MM_SHUFFLE(1,1,1,1) ); - cfmadd_conj( buf1, buf2, buf3, buf4, &res12[0], &res12[1] ); - } - { - __m128 buf3 = _mm_shuffle_ps( in22_re, in22_re, _MM_SHUFFLE(1,1,1,1) ); - __m128 buf4 = _mm_shuffle_ps( in22_im, in22_im, _MM_SHUFFLE(1,1,1,1) ); - cfmadd_conj( buf1, buf2, buf3, buf4, &res22[0], &res22[1] ); - } - } - // load 3rd part of su(3) matrix and multiply - { - __m128 buf1 = _mm_loadu_ps( D_pt+4*SIMD_LENGTH_float ); - __m128 buf2 = _mm_loadu_ps( D_pt+5*SIMD_LENGTH_float ); - { - __m128 buf3 = _mm_shuffle_ps( in11_re, in11_re, _MM_SHUFFLE(2,2,2,2) ); - __m128 buf4 = _mm_shuffle_ps( in11_im, in11_im, _MM_SHUFFLE(2,2,2,2) ); - cfmadd_conj( buf1, buf2, buf3, buf4, &res11[0], &res11[1] ); - } - { - __m128 buf3 = _mm_shuffle_ps( in21_re, in21_re, _MM_SHUFFLE(2,2,2,2) ); - __m128 buf4 = _mm_shuffle_ps( in21_im, in21_im, _MM_SHUFFLE(2,2,2,2) ); - cfmadd_conj( buf1, buf2, buf3, buf4, &res21[0], &res21[1] ); - } - { - __m128 buf3 = _mm_shuffle_ps( in12_re, in12_re, _MM_SHUFFLE(2,2,2,2) ); - __m128 buf4 = _mm_shuffle_ps( in12_im, in12_im, _MM_SHUFFLE(2,2,2,2) ); - cfmadd_conj( buf1, buf2, buf3, buf4, &res12[0], &res12[1] ); - } - { - __m128 buf3 = _mm_shuffle_ps( in22_re, in22_re, _MM_SHUFFLE(2,2,2,2) ); - __m128 buf4 = _mm_shuffle_ps( in22_im, in22_im, _MM_SHUFFLE(2,2,2,2) ); - cfmadd_conj( buf1, buf2, buf3, buf4, &res22[0], 
&res22[1] ); - } - } - } - - float *pr_pt = pr[mu]+2*12*(*nb_pt); - { - __m128 buf1 = _mm_unpacklo_ps( res11[0], res11[1] ); - __m128 buf2 = _mm_unpackhi_ps( res11[0], res11[1] ); - __m128 buf3 = _mm_unpacklo_ps( res21[0], res21[1] ); - - { - __m128 buf4 = _mm_unpackhi_ps( res21[0], res21[1] ); - buf2 = _mm_movelh_ps( buf2, buf3 ); - buf3 = _mm_shuffle_ps( buf3, buf4, _MM_SHUFFLE(1,0,3,2) ); // hi(buf3), lo(buf4) - } - { - _mm_storeu_ps( pr_pt, buf1 ); - _mm_storeu_ps( pr_pt+4, buf2 ); - _mm_storeu_ps( pr_pt+8, buf3 ); - } - } - { - __m128 buf1 = _mm_unpacklo_ps( res12[0], res12[1] ); - __m128 buf2 = _mm_unpackhi_ps( res12[0], res12[1] ); - __m128 buf3 = _mm_unpacklo_ps( res22[0], res22[1] ); - - { - __m128 buf4 = _mm_unpackhi_ps( res22[0], res22[1] ); - buf2 = _mm_movelh_ps( buf2, buf3 ); - buf3 = _mm_shuffle_ps( buf3, buf4, _MM_SHUFFLE(1,0,3,2) ); // hi(buf3), lo(buf4) - } - { - _mm_storeu_ps( pr_pt+12, buf1 ); - _mm_storeu_ps( pr_pt+16, buf2 ); - _mm_storeu_ps( pr_pt+20, buf3 ); - } - } - nb_pt++; - D_pt += 24; - } - - phi_pt += 48; - } -} - - -void pbn_double( complex_double *eta, complex_double *prp[4], int start, int end ) { - - double *pr[4] = {(double*)(prp[0]),(double*)(prp[1]),(double*)(prp[2]),(double*)(prp[3])}; - double *eta_pt = (double*)(eta+start); - - __m128d gamma0[4]; - __m128d gamma1[4]; - - for ( int mu=0; mu<4; mu++ ) { - gamma0[mu] = _mm_setr_pd( gamma_re_sign[mu][gamma_co[mu][0]], gamma_im_sign[mu][gamma_co[mu][0]] ); - gamma1[mu] = _mm_setr_pd( gamma_re_sign[mu][gamma_co[mu][1]], gamma_im_sign[mu][gamma_co[mu][1]] ); - } - - for ( int i=start; iD))+2*(start*3); - double *eta_pt = (double*)(eta+start); - double *eta_end_pt = (double*)(eta+end); - double *pr[4] = {(double*)(prn[0]),(double*)(prn[1]),(double*)(prn[2]),(double*)(prn[3])}; - int *nb_pt = neighbor+((start/12)*4); - - __m128d gamma0[4]; - __m128d gamma1[4]; - - for ( int mu=0; mu<4; mu++ ) { - gamma0[mu] = _mm_setr_pd( -gamma_re_sign[mu][gamma_co[mu][0]], -gamma_im_sign[mu][gamma_co[mu][0]] ); - gamma1[mu] = _mm_setr_pd( -gamma_re_sign[mu][gamma_co[mu][1]], -gamma_im_sign[mu][gamma_co[mu][1]] ); - } - - while( eta_pt < eta_end_pt ) { - - __m128d res[12]; - for ( int i=0; i<12; i++ ) { - res[i] = _mm_loadu_pd( eta_pt + 2*i ); - } - - // --------------- - // mu = T - { - __m128d res_re[3]; - __m128d res_im[3]; - { - __m128d v_re[3]; - __m128d v_im[3]; - int j = 2*6*(*nb_pt); - - for ( int i=0; i<3; i++ ) { - v_re[i] = _mm_setr_pd( *(pr[T]+j+0+2*i), *(pr[T]+j+6+2*i) ); - v_im[i] = _mm_setr_pd( *(pr[T]+j+1+2*i), *(pr[T]+j+7+2*i) ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf_re = _mm_set1_pd( D_pt[0+6*i] ); - __m128d buf_im = _mm_set1_pd( D_pt[1+6*i] ); - cmul_pd( buf_re, buf_im, v_re[0], v_im[0], &res_re[i], &res_im[i] ); - buf_re = _mm_set1_pd( D_pt[2+6*i] ); - buf_im = _mm_set1_pd( D_pt[3+6*i] ); - cfmadd_pd( buf_re, buf_im, v_re[1], v_im[1], &res_re[i], &res_im[i] ); - buf_re = _mm_set1_pd( D_pt[4+6*i] ); - buf_im = _mm_set1_pd( D_pt[5+6*i] ); - cfmadd_pd( buf_re, buf_im, v_re[2], v_im[2], &res_re[i], &res_im[i] ); - } - D_pt += 18; - nb_pt++; - } - { - __m128d in[6]; - in[0] = _mm_unpacklo_pd( res_re[0], res_im[0] ); - in[1] = _mm_unpacklo_pd( res_re[1], res_im[1] ); - in[2] = _mm_unpacklo_pd( res_re[2], res_im[2] ); - - in[3] = _mm_unpackhi_pd( res_re[0], res_im[0] ); - in[4] = _mm_unpackhi_pd( res_re[1], res_im[1] ); - in[5] = _mm_unpackhi_pd( res_re[2], res_im[2] ); - - for ( int i=0; i<6; i++ ) { - res[i] = _mm_sub_pd( res[i], in[i] ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf1 = 
_mm_mul_pd( gamma0[T], GAMMA_T_SHUFFLE(in[i]) ); - res[3*gamma_co[T][0]+i] = _mm_sub_pd( res[3*gamma_co[T][0]+i], buf1 ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf1 = _mm_mul_pd( gamma1[T], GAMMA_T_SHUFFLE(in[3+i]) ); - res[3*gamma_co[T][1]+i] = _mm_sub_pd( res[3*gamma_co[T][1]+i], buf1 ); - } - } - } - // --------------- - // mu = Z - { - __m128d res_re[3]; - __m128d res_im[3]; - { - __m128d v_re[3]; - __m128d v_im[3]; - int j = 2*6*(*nb_pt); - - for ( int i=0; i<3; i++ ) { - v_re[i] = _mm_setr_pd( *(pr[Z]+j+0+2*i), *(pr[Z]+j+6+2*i) ); - v_im[i] = _mm_setr_pd( *(pr[Z]+j+1+2*i), *(pr[Z]+j+7+2*i) ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf_re = _mm_set1_pd( D_pt[0+6*i] ); - __m128d buf_im = _mm_set1_pd( D_pt[1+6*i] ); - cmul_pd( buf_re, buf_im, v_re[0], v_im[0], &res_re[i], &res_im[i] ); - buf_re = _mm_set1_pd( D_pt[2+6*i] ); - buf_im = _mm_set1_pd( D_pt[3+6*i] ); - cfmadd_pd( buf_re, buf_im, v_re[1], v_im[1], &res_re[i], &res_im[i] ); - buf_re = _mm_set1_pd( D_pt[4+6*i] ); - buf_im = _mm_set1_pd( D_pt[5+6*i] ); - cfmadd_pd( buf_re, buf_im, v_re[2], v_im[2], &res_re[i], &res_im[i] ); - } - D_pt += 18; - nb_pt++; - } - { - __m128d in[6]; - in[0] = _mm_unpacklo_pd( res_re[0], res_im[0] ); - in[1] = _mm_unpacklo_pd( res_re[1], res_im[1] ); - in[2] = _mm_unpacklo_pd( res_re[2], res_im[2] ); - - in[3] = _mm_unpackhi_pd( res_re[0], res_im[0] ); - in[4] = _mm_unpackhi_pd( res_re[1], res_im[1] ); - in[5] = _mm_unpackhi_pd( res_re[2], res_im[2] ); - - for ( int i=0; i<6; i++ ) { - res[i] = _mm_sub_pd( res[i], in[i] ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf1 = _mm_mul_pd( gamma0[Z], GAMMA_Z_SHUFFLE(in[i]) ); - res[3*gamma_co[Z][0]+i] = _mm_sub_pd( res[3*gamma_co[Z][0]+i], buf1 ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf1 = _mm_mul_pd( gamma1[Z], GAMMA_Z_SHUFFLE(in[3+i]) ); - res[3*gamma_co[Z][1]+i] = _mm_sub_pd( res[3*gamma_co[Z][1]+i], buf1 ); - } - } - } - // --------------- - // mu = Y - { - __m128d res_re[3]; - __m128d res_im[3]; - { - __m128d v_re[3]; - __m128d v_im[3]; - int j = 2*6*(*nb_pt); - - for ( int i=0; i<3; i++ ) { - v_re[i] = _mm_setr_pd( *(pr[Y]+j+0+2*i), *(pr[Y]+j+6+2*i) ); - v_im[i] = _mm_setr_pd( *(pr[Y]+j+1+2*i), *(pr[Y]+j+7+2*i) ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf_re = _mm_set1_pd( D_pt[0+6*i] ); - __m128d buf_im = _mm_set1_pd( D_pt[1+6*i] ); - cmul_pd( buf_re, buf_im, v_re[0], v_im[0], &res_re[i], &res_im[i] ); - buf_re = _mm_set1_pd( D_pt[2+6*i] ); - buf_im = _mm_set1_pd( D_pt[3+6*i] ); - cfmadd_pd( buf_re, buf_im, v_re[1], v_im[1], &res_re[i], &res_im[i] ); - buf_re = _mm_set1_pd( D_pt[4+6*i] ); - buf_im = _mm_set1_pd( D_pt[5+6*i] ); - cfmadd_pd( buf_re, buf_im, v_re[2], v_im[2], &res_re[i], &res_im[i] ); - } - D_pt += 18; - nb_pt++; - } - { - __m128d in[6]; - in[0] = _mm_unpacklo_pd( res_re[0], res_im[0] ); - in[1] = _mm_unpacklo_pd( res_re[1], res_im[1] ); - in[2] = _mm_unpacklo_pd( res_re[2], res_im[2] ); - - in[3] = _mm_unpackhi_pd( res_re[0], res_im[0] ); - in[4] = _mm_unpackhi_pd( res_re[1], res_im[1] ); - in[5] = _mm_unpackhi_pd( res_re[2], res_im[2] ); - - for ( int i=0; i<6; i++ ) { - res[i] = _mm_sub_pd( res[i], in[i] ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf1 = _mm_mul_pd( gamma0[Y], GAMMA_Y_SHUFFLE(in[i]) ); - res[3*gamma_co[Y][0]+i] = _mm_sub_pd( res[3*gamma_co[Y][0]+i], buf1 ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf1 = _mm_mul_pd( gamma1[Y], GAMMA_Y_SHUFFLE(in[3+i]) ); - res[3*gamma_co[Y][1]+i] = _mm_sub_pd( res[3*gamma_co[Y][1]+i], buf1 ); - } - } - } - // --------------- - // mu = X - { - __m128d 
res_re[3]; - __m128d res_im[3]; - { - __m128d v_re[3]; - __m128d v_im[3]; - int j = 2*6*(*nb_pt); - - for ( int i=0; i<3; i++ ) { - v_re[i] = _mm_setr_pd( *(pr[X]+j+0+2*i), *(pr[X]+j+6+2*i) ); - v_im[i] = _mm_setr_pd( *(pr[X]+j+1+2*i), *(pr[X]+j+7+2*i) ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf_re = _mm_set1_pd( D_pt[0+6*i] ); - __m128d buf_im = _mm_set1_pd( D_pt[1+6*i] ); - cmul_pd( buf_re, buf_im, v_re[0], v_im[0], &res_re[i], &res_im[i] ); - buf_re = _mm_set1_pd( D_pt[2+6*i] ); - buf_im = _mm_set1_pd( D_pt[3+6*i] ); - cfmadd_pd( buf_re, buf_im, v_re[1], v_im[1], &res_re[i], &res_im[i] ); - buf_re = _mm_set1_pd( D_pt[4+6*i] ); - buf_im = _mm_set1_pd( D_pt[5+6*i] ); - cfmadd_pd( buf_re, buf_im, v_re[2], v_im[2], &res_re[i], &res_im[i] ); - } - D_pt += 18; - nb_pt++; - } - { - __m128d in[6]; - in[0] = _mm_unpacklo_pd( res_re[0], res_im[0] ); - in[1] = _mm_unpacklo_pd( res_re[1], res_im[1] ); - in[2] = _mm_unpacklo_pd( res_re[2], res_im[2] ); - - in[3] = _mm_unpackhi_pd( res_re[0], res_im[0] ); - in[4] = _mm_unpackhi_pd( res_re[1], res_im[1] ); - in[5] = _mm_unpackhi_pd( res_re[2], res_im[2] ); - - for ( int i=0; i<6; i++ ) { - res[i] = _mm_sub_pd( res[i], in[i] ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf1 = _mm_mul_pd( gamma0[X], GAMMA_X_SHUFFLE(in[i]) ); - res[3*gamma_co[X][0]+i] = _mm_sub_pd( res[3*gamma_co[X][0]+i], buf1 ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf1 = _mm_mul_pd( gamma1[X], GAMMA_X_SHUFFLE(in[3+i]) ); - res[3*gamma_co[X][1]+i] = _mm_sub_pd( res[3*gamma_co[X][1]+i], buf1 ); - } - } - } - // --------------- - - for ( int i=0; i<12; i++ ) { - _mm_storeu_pd( eta_pt + 2*i, res[i] ); - } - eta_pt+=24; - } - -} - - -void su3_pbp_float( complex_float* eta, complex_float *prn[4], operator_float_struct *op, - int *neighbor, int start, int end ) { - - float *D_pt = (float*)(op->D_vectorized+2*(start*4)); - float *eta_pt = (float*)(eta+start); - float *eta_end_pt = (float*)(eta+end); - float *pr[4] = {(float*)(prn[0]),(float*)(prn[1]),(float*)(prn[2]),(float*)(prn[3])}; - int *nb_pt = neighbor+((start/12)*4); - - __m128 gamma0[4][2]; - __m128 gamma1[4][2]; - - for ( int mu=0; mu<4; mu++ ) { - gamma0[mu][0] = _mm_set1_ps( -gamma_re_sign[mu][gamma_co[mu][0]] ); - gamma0[mu][1] = _mm_set1_ps( -gamma_im_sign[mu][gamma_co[mu][0]] ); - gamma1[mu][0] = _mm_set1_ps( -gamma_re_sign[mu][gamma_co[mu][1]] ); - gamma1[mu][1] = _mm_set1_ps( -gamma_im_sign[mu][gamma_co[mu][1]] ); - } - - while( eta_pt < eta_end_pt ) { - - __m128 eta_lo1 = _mm_loadu_ps( eta_pt ); - __m128 eta_lo2 = _mm_loadu_ps( eta_pt + 4 ); - __m128 eta_hi1 = _mm_loadu_ps( eta_pt + 6 ); - __m128 eta_hi2 = _mm_loadu_ps( eta_pt + 10 ); - - __m128 eta2_lo[2]; - __m128 eta2_hi[2]; - - eta2_lo[0] = _mm_loadu_ps( eta_pt + 12 ); - eta2_hi[0] = _mm_loadu_ps( eta_pt + 14 ); - eta2_lo[1] = _mm_loadu_ps( eta_pt + 18 ); - eta2_hi[1] = _mm_loadu_ps( eta_pt + 20 ); - - for ( int mu=0; mu<4; mu++ ) { - __m128 res1[2]; - __m128 res2[2]; - - { - int j = 2*6*(*nb_pt); - // load 1st part of su(3) matrix and multiply - { - __m128 buf1 = _mm_loadu_ps( D_pt ); - __m128 buf2 = _mm_loadu_ps( D_pt+SIMD_LENGTH_float ); - { - __m128 buf3 = _mm_set1_ps( *(pr[mu]+j+0) ); - __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+1) ); - cmul( buf1, buf2, buf3, buf4, &res1[0], &res1[1] ); - } - { - __m128 buf3 = _mm_set1_ps( *(pr[mu]+j+6) ); - __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+7) ); - cmul( buf1, buf2, buf3, buf4, &res2[0], &res2[1] ); - } - } - // load 2nd part of su(3) matrix and multiply - { - __m128 buf1 = _mm_loadu_ps( D_pt+2*SIMD_LENGTH_float ); 
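
// Scalar sketch of the projection / reconstruction ("pbp") pattern that
// the vector code above implements: per direction, the half-spinor
// result r (already multiplied by the link) is subtracted from the two
// upper spin components of eta, and a gamma-permuted, phase-multiplied
// copy is subtracted from the coupled lower components.  The table
// layout below (co[], sign[]) is a simplification of the library's
// gamma_co and gamma_*_sign tables, so treat it as an assumption.
#include <complex.h>
typedef double complex cplx;

typedef struct {
  int co[2];      // which lower spin component (2 or 3) couples to upper 0,1
  cplx sign[2];   // the +-1 / +-i phase of each coupling
} gamma_dir;

static void pbp_sketch( cplx eta[12], const cplx r[6], const gamma_dir *g ) {
  for ( int s=0; s<2; s++ )        // two upper spin components
    for ( int c=0; c<3; c++ ) {    // three colors
      eta[3*s + c]        -= r[3*s + c];                // upper part
      eta[3*g->co[s] + c] -= g->sign[s] * r[3*s + c];   // coupled lower part
    }
}
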
- __m128 buf2 = _mm_loadu_ps( D_pt+3*SIMD_LENGTH_float ); - { - __m128 buf3 = _mm_set1_ps( *(pr[mu]+j+2) ); - __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+3) ); - cfmadd( buf1, buf2, buf3, buf4, &res1[0], &res1[1] ); - } - { - __m128 buf3 = _mm_set1_ps( *(pr[mu]+j+8) ); - __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+9) ); - cfmadd( buf1, buf2, buf3, buf4, &res2[0], &res2[1] ); - } - } - // load 3rd part of su(3) matrix and multiply - { - __m128 buf1 = _mm_loadu_ps( D_pt+4*SIMD_LENGTH_float ); - __m128 buf2 = _mm_loadu_ps( D_pt+5*SIMD_LENGTH_float ); - { - __m128 buf3 = _mm_set1_ps( *(pr[mu]+j+4) ); - __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+5) ); - cfmadd( buf1, buf2, buf3, buf4, &res1[0], &res1[1] ); - } - { - __m128 buf3 = _mm_set1_ps( *(pr[mu]+j+10) ); - __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+11) ); - cfmadd( buf1, buf2, buf3, buf4, &res2[0], &res2[1] ); - } - } - } - - { - // store spin0 contribution - { - __m128 buf1 = _mm_unpacklo_ps( res1[0], res1[1] ); - __m128 buf2 = _mm_unpackhi_ps( res1[0], res1[1] ); - eta_lo1 = _mm_sub_ps( eta_lo1, buf1 ); - eta_lo2 = _mm_sub_ps( eta_lo2, buf2 ); - } - - // store contribution from 1st SU(3) multiplication to either spin2 or spin3 - __m128 buf1 = _mm_mul_ps( gamma0[mu][0], res1[gamma_offset[mu][0]] ); - __m128 buf2 = _mm_mul_ps( gamma0[mu][1], res1[1-gamma_offset[mu][0]] ); - __m128 buf3 = _mm_unpacklo_ps( buf1, buf2 ); - __m128 buf4 = _mm_unpackhi_ps( buf1, buf2 ); - buf4 = _mm_shuffle_ps( buf3, buf4, _MM_SHUFFLE(1,0,3,2) ); // hi(buf3), lo(buf4) - eta2_lo[gamma_co[mu][2]] = _mm_sub_ps( eta2_lo[gamma_co[mu][2]], buf3 ); - eta2_hi[gamma_co[mu][2]] = _mm_sub_ps( eta2_hi[gamma_co[mu][2]], buf4 ); - } - - { - // store spin1 contribution - { - __m128 buf1 = _mm_unpacklo_ps( res2[0], res2[1] ); - __m128 buf2 = _mm_unpackhi_ps( res2[0], res2[1] ); - eta_hi1 = _mm_sub_ps( eta_hi1, buf1 ); - eta_hi2 = _mm_sub_ps( eta_hi2, buf2 ); - } - - // store contribution from 1st SU(3) multiplication to either spin2 or spin3 - __m128 buf1 = _mm_mul_ps( gamma1[mu][0], res2[gamma_offset[mu][1]] ); - __m128 buf2 = _mm_mul_ps( gamma1[mu][1], res2[1-gamma_offset[mu][1]] ); - __m128 buf3 = _mm_unpacklo_ps( buf1, buf2 ); - __m128 buf4 = _mm_unpackhi_ps( buf1, buf2 ); - buf4 = _mm_shuffle_ps( buf3, buf4, _MM_SHUFFLE(1,0,3,2) ); // hi(buf3), lo(buf4) - eta2_lo[gamma_co[mu][3]] = _mm_sub_ps( eta2_lo[gamma_co[mu][3]], buf3 ); - eta2_hi[gamma_co[mu][3]] = _mm_sub_ps( eta2_hi[gamma_co[mu][3]], buf4 ); - } - - nb_pt++; - D_pt += 24; - } - - _mm_storeu_ps( eta_pt, eta_lo1 ); - _mm_storeu_ps( eta_pt+4, eta_lo2 ); - _mm_storeu_ps( eta_pt+6, eta_hi1 ); - _mm_storeu_ps( eta_pt+10, eta_hi2 ); - _mm_storeu_ps( eta_pt+12, eta2_lo[0] ); - _mm_storeu_ps( eta_pt+14, eta2_hi[0] ); - _mm_storeu_ps( eta_pt+18, eta2_lo[1] ); - _mm_storeu_ps( eta_pt+20, eta2_hi[1] ); - - eta_pt += 24; - } - -} - - -void su3_dpbp_double( complex_double* eta, complex_double *prn[4], operator_double_struct *op, - int *neighbor, int start, int end ) { - - double *D_pt = ((double*)(op->D))+2*(start/24*36); - double *eta_pt = (double*)(eta+start); - double *eta_end_pt = (double*)(eta+end); - double *pr[4] = {(double*)(prn[0]),(double*)(prn[1]),(double*)(prn[2]),(double*)(prn[3])}; - int *nb_pt = neighbor+((start/24)*4); - - __m128d gamma0[4]; - __m128d gamma1[4]; - - for ( int mu=0; mu<4; mu++ ) { - gamma0[mu] = _mm_setr_pd( -gamma_re_sign[mu][gamma_co[mu][0]], -gamma_im_sign[mu][gamma_co[mu][0]] ); - gamma1[mu] = _mm_setr_pd( -gamma_re_sign[mu][gamma_co[mu][1]], -gamma_im_sign[mu][gamma_co[mu][1]] ); - } - - while( eta_pt < 
eta_end_pt ) { - - __m128d res[24]; - for ( int i=0; i<24; i++ ) { - res[i] = _mm_loadu_pd( eta_pt + 2*i ); - } - - // --------------- - // mu = T - { - __m128d res_re[6]; - __m128d res_im[6]; - { - __m128d v_re[6]; - __m128d v_im[6]; - int j = 2*12*(*nb_pt); - - for ( int i=0; i<3; i++ ) { - v_re[i] = _mm_setr_pd( *(pr[T]+j+0+2*i), *(pr[T]+j+6+2*i) ); - v_im[i] = _mm_setr_pd( *(pr[T]+j+1+2*i), *(pr[T]+j+7+2*i) ); - } - for ( int i=3; i<6; i++ ) { - v_re[i] = _mm_setr_pd( *(pr[T]+j+6+2*i), *(pr[T]+j+12+2*i) ); - v_im[i] = _mm_setr_pd( *(pr[T]+j+7+2*i), *(pr[T]+j+13+2*i) ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf_re = _mm_set1_pd( D_pt[0+6*i] ); - __m128d buf_im = _mm_set1_pd( D_pt[1+6*i] ); - cmul_pd( buf_re, buf_im, v_re[0], v_im[0], &res_re[i], &res_im[i] ); - cmul_pd( buf_re, buf_im, v_re[3], v_im[3], &res_re[i+3], &res_im[i+3] ); - buf_re = _mm_set1_pd( D_pt[2+6*i] ); - buf_im = _mm_set1_pd( D_pt[3+6*i] ); - cfmadd_pd( buf_re, buf_im, v_re[1], v_im[1], &res_re[i], &res_im[i] ); - cfmadd_pd( buf_re, buf_im, v_re[4], v_im[4], &res_re[i+3], &res_im[i+3] ); - buf_re = _mm_set1_pd( D_pt[4+6*i] ); - buf_im = _mm_set1_pd( D_pt[5+6*i] ); - cfmadd_pd( buf_re, buf_im, v_re[2], v_im[2], &res_re[i], &res_im[i] ); - cfmadd_pd( buf_re, buf_im, v_re[5], v_im[5], &res_re[i+3], &res_im[i+3] ); - } - D_pt += 18; - nb_pt++; - } - { - __m128d in[12]; - in[0] = _mm_unpacklo_pd( res_re[0], res_im[0] ); - in[1] = _mm_unpacklo_pd( res_re[1], res_im[1] ); - in[2] = _mm_unpacklo_pd( res_re[2], res_im[2] ); - - in[3] = _mm_unpackhi_pd( res_re[0], res_im[0] ); - in[4] = _mm_unpackhi_pd( res_re[1], res_im[1] ); - in[5] = _mm_unpackhi_pd( res_re[2], res_im[2] ); - - in[6] = _mm_unpacklo_pd( res_re[3], res_im[3] ); - in[7] = _mm_unpacklo_pd( res_re[4], res_im[4] ); - in[8] = _mm_unpacklo_pd( res_re[5], res_im[5] ); - - in[9] = _mm_unpackhi_pd( res_re[3], res_im[3] ); - in[10] = _mm_unpackhi_pd( res_re[4], res_im[4] ); - in[11] = _mm_unpackhi_pd( res_re[5], res_im[5] ); - - for ( int i=0; i<12; i++ ) { - res[i] = _mm_sub_pd( res[i], in[i] ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf1 = _mm_mul_pd( gamma0[T], GAMMA_T_SHUFFLE(in[i]) ); - res[3*gamma_co[T][0]+6*(gamma_co[T][0]/2)+i] = _mm_sub_pd( res[3*gamma_co[T][0]+6*(gamma_co[T][0]/2)+i], buf1 ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf1 = _mm_mul_pd( gamma1[T], GAMMA_T_SHUFFLE(in[i+3]) ); - res[3*gamma_co[T][1]+6*(gamma_co[T][1]/2)+i] = _mm_sub_pd( res[3*gamma_co[T][1]+6*(gamma_co[T][1]/2)+i], buf1 ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf1 = _mm_mul_pd( gamma0[T], GAMMA_T_SHUFFLE(in[i+6]) ); - res[3*gamma_co[T][0]+6*(gamma_co[T][0]/2)+6+i] = _mm_sub_pd( res[3*gamma_co[T][0]+6*(gamma_co[T][0]/2)+6+i], buf1 ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf1 = _mm_mul_pd( gamma1[T], GAMMA_T_SHUFFLE(in[i+9]) ); - res[3*gamma_co[T][1]+6*(gamma_co[T][1]/2)+6+i] = _mm_sub_pd( res[3*gamma_co[T][1]+6*(gamma_co[T][1]/2)+6+i], buf1 ); - } - } - } - // --------------- - // mu = Z - { - __m128d res_re[6]; - __m128d res_im[6]; - { - __m128d v_re[6]; - __m128d v_im[6]; - int j = 2*12*(*nb_pt); - - for ( int i=0; i<3; i++ ) { - v_re[i] = _mm_setr_pd( *(pr[Z]+j+0+2*i), *(pr[Z]+j+6+2*i) ); - v_im[i] = _mm_setr_pd( *(pr[Z]+j+1+2*i), *(pr[Z]+j+7+2*i) ); - } - for ( int i=3; i<6; i++ ) { - v_re[i] = _mm_setr_pd( *(pr[Z]+j+6+2*i), *(pr[Z]+j+12+2*i) ); - v_im[i] = _mm_setr_pd( *(pr[Z]+j+7+2*i), *(pr[Z]+j+13+2*i) ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf_re = _mm_set1_pd( D_pt[0+6*i] ); - __m128d buf_im = _mm_set1_pd( D_pt[1+6*i] ); - cmul_pd( 
buf_re, buf_im, v_re[0], v_im[0], &res_re[i], &res_im[i] ); - cmul_pd( buf_re, buf_im, v_re[3], v_im[3], &res_re[i+3], &res_im[i+3] ); - buf_re = _mm_set1_pd( D_pt[2+6*i] ); - buf_im = _mm_set1_pd( D_pt[3+6*i] ); - cfmadd_pd( buf_re, buf_im, v_re[1], v_im[1], &res_re[i], &res_im[i] ); - cfmadd_pd( buf_re, buf_im, v_re[4], v_im[4], &res_re[i+3], &res_im[i+3] ); - buf_re = _mm_set1_pd( D_pt[4+6*i] ); - buf_im = _mm_set1_pd( D_pt[5+6*i] ); - cfmadd_pd( buf_re, buf_im, v_re[2], v_im[2], &res_re[i], &res_im[i] ); - cfmadd_pd( buf_re, buf_im, v_re[5], v_im[5], &res_re[i+3], &res_im[i+3] ); - } - D_pt += 18; - nb_pt++; - } - { - __m128d in[12]; - in[0] = _mm_unpacklo_pd( res_re[0], res_im[0] ); - in[1] = _mm_unpacklo_pd( res_re[1], res_im[1] ); - in[2] = _mm_unpacklo_pd( res_re[2], res_im[2] ); - - in[3] = _mm_unpackhi_pd( res_re[0], res_im[0] ); - in[4] = _mm_unpackhi_pd( res_re[1], res_im[1] ); - in[5] = _mm_unpackhi_pd( res_re[2], res_im[2] ); - - in[6] = _mm_unpacklo_pd( res_re[3], res_im[3] ); - in[7] = _mm_unpacklo_pd( res_re[4], res_im[4] ); - in[8] = _mm_unpacklo_pd( res_re[5], res_im[5] ); - - in[9] = _mm_unpackhi_pd( res_re[3], res_im[3] ); - in[10] = _mm_unpackhi_pd( res_re[4], res_im[4] ); - in[11] = _mm_unpackhi_pd( res_re[5], res_im[5] ); - - for ( int i=0; i<12; i++ ) { - res[i] = _mm_sub_pd( res[i], in[i] ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf1 = _mm_mul_pd( gamma0[Z], GAMMA_Z_SHUFFLE(in[i]) ); - res[3*gamma_co[Z][0]+6*(gamma_co[Z][0]/2)+i] = _mm_sub_pd( res[3*gamma_co[Z][0]+6*(gamma_co[Z][0]/2)+i], buf1 ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf1 = _mm_mul_pd( gamma1[Z], GAMMA_Z_SHUFFLE(in[i+3]) ); - res[3*gamma_co[Z][1]+6*(gamma_co[Z][1]/2)+i] = _mm_sub_pd( res[3*gamma_co[Z][1]+6*(gamma_co[Z][1]/2)+i], buf1 ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf1 = _mm_mul_pd( gamma0[Z], GAMMA_Z_SHUFFLE(in[i+6]) ); - res[3*gamma_co[Z][0]+6*(gamma_co[Z][0]/2)+6+i] = _mm_sub_pd( res[3*gamma_co[Z][0]+6*(gamma_co[Z][0]/2)+6+i], buf1 ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf1 = _mm_mul_pd( gamma1[Z], GAMMA_Z_SHUFFLE(in[i+9]) ); - res[3*gamma_co[Z][1]+6*(gamma_co[Z][1]/2)+6+i] = _mm_sub_pd( res[3*gamma_co[Z][1]+6*(gamma_co[Z][1]/2)+6+i], buf1 ); - } - } - } - // --------------- - // mu = Y - { - __m128d res_re[6]; - __m128d res_im[6]; - { - __m128d v_re[6]; - __m128d v_im[6]; - int j = 2*12*(*nb_pt); - - for ( int i=0; i<3; i++ ) { - v_re[i] = _mm_setr_pd( *(pr[Y]+j+0+2*i), *(pr[Y]+j+6+2*i) ); - v_im[i] = _mm_setr_pd( *(pr[Y]+j+1+2*i), *(pr[Y]+j+7+2*i) ); - } - for ( int i=3; i<6; i++ ) { - v_re[i] = _mm_setr_pd( *(pr[Y]+j+6+2*i), *(pr[Y]+j+12+2*i) ); - v_im[i] = _mm_setr_pd( *(pr[Y]+j+7+2*i), *(pr[Y]+j+13+2*i) ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf_re = _mm_set1_pd( D_pt[0+6*i] ); - __m128d buf_im = _mm_set1_pd( D_pt[1+6*i] ); - cmul_pd( buf_re, buf_im, v_re[0], v_im[0], &res_re[i], &res_im[i] ); - cmul_pd( buf_re, buf_im, v_re[3], v_im[3], &res_re[i+3], &res_im[i+3] ); - buf_re = _mm_set1_pd( D_pt[2+6*i] ); - buf_im = _mm_set1_pd( D_pt[3+6*i] ); - cfmadd_pd( buf_re, buf_im, v_re[1], v_im[1], &res_re[i], &res_im[i] ); - cfmadd_pd( buf_re, buf_im, v_re[4], v_im[4], &res_re[i+3], &res_im[i+3] ); - buf_re = _mm_set1_pd( D_pt[4+6*i] ); - buf_im = _mm_set1_pd( D_pt[5+6*i] ); - cfmadd_pd( buf_re, buf_im, v_re[2], v_im[2], &res_re[i], &res_im[i] ); - cfmadd_pd( buf_re, buf_im, v_re[5], v_im[5], &res_re[i+3], &res_im[i+3] ); - } - D_pt += 18; - nb_pt++; - } - { - __m128d in[12]; - in[0] = _mm_unpacklo_pd( res_re[0], res_im[0] ); - in[1] = _mm_unpacklo_pd( 
res_re[1], res_im[1] ); - in[2] = _mm_unpacklo_pd( res_re[2], res_im[2] ); - - in[3] = _mm_unpackhi_pd( res_re[0], res_im[0] ); - in[4] = _mm_unpackhi_pd( res_re[1], res_im[1] ); - in[5] = _mm_unpackhi_pd( res_re[2], res_im[2] ); - - in[6] = _mm_unpacklo_pd( res_re[3], res_im[3] ); - in[7] = _mm_unpacklo_pd( res_re[4], res_im[4] ); - in[8] = _mm_unpacklo_pd( res_re[5], res_im[5] ); - - in[9] = _mm_unpackhi_pd( res_re[3], res_im[3] ); - in[10] = _mm_unpackhi_pd( res_re[4], res_im[4] ); - in[11] = _mm_unpackhi_pd( res_re[5], res_im[5] ); - - for ( int i=0; i<12; i++ ) { - res[i] = _mm_sub_pd( res[i], in[i] ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf1 = _mm_mul_pd( gamma0[Y], GAMMA_Y_SHUFFLE(in[i]) ); - res[3*gamma_co[Y][0]+6*(gamma_co[Y][0]/2)+i] = _mm_sub_pd( res[3*gamma_co[Y][0]+6*(gamma_co[Y][0]/2)+i], buf1 ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf1 = _mm_mul_pd( gamma1[Y], GAMMA_Y_SHUFFLE(in[i+3]) ); - res[3*gamma_co[Y][1]+6*(gamma_co[Y][1]/2)+i] = _mm_sub_pd( res[3*gamma_co[Y][1]+6*(gamma_co[Y][1]/2)+i], buf1 ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf1 = _mm_mul_pd( gamma0[Y], GAMMA_Y_SHUFFLE(in[i+6]) ); - res[3*gamma_co[Y][0]+6*(gamma_co[Y][0]/2)+6+i] = _mm_sub_pd( res[3*gamma_co[Y][0]+6*(gamma_co[Y][0]/2)+6+i], buf1 ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf1 = _mm_mul_pd( gamma1[Y], GAMMA_Y_SHUFFLE(in[i+9]) ); - res[3*gamma_co[Y][1]+6*(gamma_co[Y][1]/2)+6+i] = _mm_sub_pd( res[3*gamma_co[Y][1]+6*(gamma_co[Y][1]/2)+6+i], buf1 ); - } - } - } - // --------------- - // mu = X - { - __m128d res_re[6]; - __m128d res_im[6]; - { - __m128d v_re[6]; - __m128d v_im[6]; - int j = 2*12*(*nb_pt); - - for ( int i=0; i<3; i++ ) { - v_re[i] = _mm_setr_pd( *(pr[X]+j+0+2*i), *(pr[X]+j+6+2*i) ); - v_im[i] = _mm_setr_pd( *(pr[X]+j+1+2*i), *(pr[X]+j+7+2*i) ); - } - for ( int i=3; i<6; i++ ) { - v_re[i] = _mm_setr_pd( *(pr[X]+j+6+2*i), *(pr[X]+j+12+2*i) ); - v_im[i] = _mm_setr_pd( *(pr[X]+j+7+2*i), *(pr[X]+j+13+2*i) ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf_re = _mm_set1_pd( D_pt[0+6*i] ); - __m128d buf_im = _mm_set1_pd( D_pt[1+6*i] ); - cmul_pd( buf_re, buf_im, v_re[0], v_im[0], &res_re[i], &res_im[i] ); - cmul_pd( buf_re, buf_im, v_re[3], v_im[3], &res_re[i+3], &res_im[i+3] ); - buf_re = _mm_set1_pd( D_pt[2+6*i] ); - buf_im = _mm_set1_pd( D_pt[3+6*i] ); - cfmadd_pd( buf_re, buf_im, v_re[1], v_im[1], &res_re[i], &res_im[i] ); - cfmadd_pd( buf_re, buf_im, v_re[4], v_im[4], &res_re[i+3], &res_im[i+3] ); - buf_re = _mm_set1_pd( D_pt[4+6*i] ); - buf_im = _mm_set1_pd( D_pt[5+6*i] ); - cfmadd_pd( buf_re, buf_im, v_re[2], v_im[2], &res_re[i], &res_im[i] ); - cfmadd_pd( buf_re, buf_im, v_re[5], v_im[5], &res_re[i+3], &res_im[i+3] ); - } - D_pt += 18; - nb_pt++; - } - { - __m128d in[12]; - in[0] = _mm_unpacklo_pd( res_re[0], res_im[0] ); - in[1] = _mm_unpacklo_pd( res_re[1], res_im[1] ); - in[2] = _mm_unpacklo_pd( res_re[2], res_im[2] ); - - in[3] = _mm_unpackhi_pd( res_re[0], res_im[0] ); - in[4] = _mm_unpackhi_pd( res_re[1], res_im[1] ); - in[5] = _mm_unpackhi_pd( res_re[2], res_im[2] ); - - in[6] = _mm_unpacklo_pd( res_re[3], res_im[3] ); - in[7] = _mm_unpacklo_pd( res_re[4], res_im[4] ); - in[8] = _mm_unpacklo_pd( res_re[5], res_im[5] ); - - in[9] = _mm_unpackhi_pd( res_re[3], res_im[3] ); - in[10] = _mm_unpackhi_pd( res_re[4], res_im[4] ); - in[11] = _mm_unpackhi_pd( res_re[5], res_im[5] ); - - for ( int i=0; i<12; i++ ) { - res[i] = _mm_sub_pd( res[i], in[i] ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf1 = _mm_mul_pd( gamma0[X], GAMMA_X_SHUFFLE(in[i]) ); - 
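/* The cmul_pd/cfmadd_pd calls in the blocks above work on "split" complex
 * operands: real and imaginary parts are kept in separate __m128d registers,
 * two lattice entries per register. A minimal sketch of what such helpers
 * compute, assuming this split layout (the library's own definitions live in
 * its SSE headers; the _sketch names are illustrative only):
 */
static inline void cmul_pd_sketch( __m128d a_re, __m128d a_im, __m128d b_re, __m128d b_im,
                                   __m128d *c_re, __m128d *c_im ) {
  // c = a*b: (a_re + i a_im)(b_re + i b_im)
  *c_re = _mm_sub_pd( _mm_mul_pd( a_re, b_re ), _mm_mul_pd( a_im, b_im ) );
  *c_im = _mm_add_pd( _mm_mul_pd( a_re, b_im ), _mm_mul_pd( a_im, b_re ) );
}
static inline void cfmadd_pd_sketch( __m128d a_re, __m128d a_im, __m128d b_re, __m128d b_im,
                                     __m128d *c_re, __m128d *c_im ) {
  // c += a*b, accumulating the next color component of the su(3) row
  *c_re = _mm_add_pd( *c_re, _mm_sub_pd( _mm_mul_pd( a_re, b_re ), _mm_mul_pd( a_im, b_im ) ) );
  *c_im = _mm_add_pd( *c_im, _mm_add_pd( _mm_mul_pd( a_re, b_im ), _mm_mul_pd( a_im, b_re ) ) );
}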
res[3*gamma_co[X][0]+6*(gamma_co[X][0]/2)+i] = _mm_sub_pd( res[3*gamma_co[X][0]+6*(gamma_co[X][0]/2)+i], buf1 ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf1 = _mm_mul_pd( gamma1[X], GAMMA_X_SHUFFLE(in[i+3]) ); - res[3*gamma_co[X][1]+6*(gamma_co[X][1]/2)+i] = _mm_sub_pd( res[3*gamma_co[X][1]+6*(gamma_co[X][1]/2)+i], buf1 ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf1 = _mm_mul_pd( gamma0[X], GAMMA_X_SHUFFLE(in[i+6]) ); - res[3*gamma_co[X][0]+6*(gamma_co[X][0]/2)+6+i] = _mm_sub_pd( res[3*gamma_co[X][0]+6*(gamma_co[X][0]/2)+6+i], buf1 ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf1 = _mm_mul_pd( gamma1[X], GAMMA_X_SHUFFLE(in[i+9]) ); - res[3*gamma_co[X][1]+6*(gamma_co[X][1]/2)+6+i] = _mm_sub_pd( res[3*gamma_co[X][1]+6*(gamma_co[X][1]/2)+6+i], buf1 ); - } - } - } - // --------------- - - for ( int i=0; i<24; i++ ) { - _mm_storeu_pd( eta_pt + 2*i, res[i] ); - } - eta_pt+=48; - } - -} - - -void su3_dpbp_float( complex_float* eta, complex_float *prn[4], operator_float_struct *op, - int *neighbor, int start, int end ) { - - float *D_pt = (float*)(op->D_vectorized+2*(start/24*48)); - float *eta_pt = (float*)(eta+start); - float *eta_end_pt = (float*)(eta+end); - float *pr[4] = {(float*)(prn[0]),(float*)(prn[1]),(float*)(prn[2]),(float*)(prn[3])}; - int *nb_pt = neighbor+((start/24)*4); - - __m128 gamma0[4][2]; - __m128 gamma1[4][2]; - - for ( int mu=0; mu<4; mu++ ) { - gamma0[mu][0] = _mm_set1_ps( -gamma_re_sign[mu][gamma_co[mu][0]] ); - gamma0[mu][1] = _mm_set1_ps( -gamma_im_sign[mu][gamma_co[mu][0]] ); - gamma1[mu][0] = _mm_set1_ps( -gamma_re_sign[mu][gamma_co[mu][1]] ); - gamma1[mu][1] = _mm_set1_ps( -gamma_im_sign[mu][gamma_co[mu][1]] ); - } - - while( eta_pt < eta_end_pt ) { - - __m128 eta_lo1 = _mm_loadu_ps( eta_pt ); - __m128 eta_lo2 = _mm_loadu_ps( eta_pt + 4 ); - __m128 eta_hi1 = _mm_loadu_ps( eta_pt + 6 ); - __m128 eta_hi2 = _mm_loadu_ps( eta_pt + 10 ); - __m128 eta_lo3 = _mm_loadu_ps( eta_pt + 12 ); - __m128 eta_lo4 = _mm_loadu_ps( eta_pt + 16 ); - __m128 eta_hi3 = _mm_loadu_ps( eta_pt + 18 ); - __m128 eta_hi4 = _mm_loadu_ps( eta_pt + 22 ); - - __m128 eta2_lo[4]; - __m128 eta2_hi[4]; - - eta2_lo[0] = _mm_loadu_ps( eta_pt + 24 ); - eta2_hi[0] = _mm_loadu_ps( eta_pt + 26 ); - eta2_lo[1] = _mm_loadu_ps( eta_pt + 30 ); - eta2_hi[1] = _mm_loadu_ps( eta_pt + 32 ); - eta2_lo[2] = _mm_loadu_ps( eta_pt + 36 ); - eta2_hi[2] = _mm_loadu_ps( eta_pt + 38 ); - eta2_lo[3] = _mm_loadu_ps( eta_pt + 42 ); - eta2_hi[3] = _mm_loadu_ps( eta_pt + 44 ); - - for ( int mu=0; mu<4; mu++ ) { - __m128 res1[4]; - __m128 res2[4]; - - { - int j = 2*12*(*nb_pt); - // load 1st part of su(3) matrix and multiply - { - __m128 buf1 = _mm_loadu_ps( D_pt ); - __m128 buf2 = _mm_loadu_ps( D_pt+SIMD_LENGTH_float ); - { - __m128 buf3 = _mm_set1_ps( *(pr[mu]+j+0) ); - __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+1) ); - cmul( buf1, buf2, buf3, buf4, &res1[0], &res1[1] ); - } - { - __m128 buf3 = _mm_set1_ps( *(pr[mu]+j+6) ); - __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+7) ); - cmul( buf1, buf2, buf3, buf4, &res2[0], &res2[1] ); - } - { - __m128 buf3 = _mm_set1_ps( *(pr[mu]+j+12) ); - __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+13) ); - cmul( buf1, buf2, buf3, buf4, &res1[2], &res1[3] ); - } - { - __m128 buf3 = _mm_set1_ps( *(pr[mu]+j+18) ); - __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+19) ); - cmul( buf1, buf2, buf3, buf4, &res2[2], &res2[3] ); - } - } - // load 2nd part of su(3) matrix and multiply - { - __m128 buf1 = _mm_loadu_ps( D_pt+2*SIMD_LENGTH_float ); - __m128 buf2 = _mm_loadu_ps( D_pt+3*SIMD_LENGTH_float ); - { - __m128 buf3 = 
_mm_set1_ps( *(pr[mu]+j+2) ); - __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+3) ); - cfmadd( buf1, buf2, buf3, buf4, &res1[0], &res1[1] ); - } - { - __m128 buf3 = _mm_set1_ps( *(pr[mu]+j+8) ); - __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+9) ); - cfmadd( buf1, buf2, buf3, buf4, &res2[0], &res2[1] ); - } - { - __m128 buf3 = _mm_set1_ps( *(pr[mu]+j+14) ); - __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+15) ); - cfmadd( buf1, buf2, buf3, buf4, &res1[2], &res1[3] ); - } - { - __m128 buf3 = _mm_set1_ps( *(pr[mu]+j+20) ); - __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+21) ); - cfmadd( buf1, buf2, buf3, buf4, &res2[2], &res2[3] ); - } - } - // load 3rd part of su(3) matrix and multiply - { - __m128 buf1 = _mm_loadu_ps( D_pt+4*SIMD_LENGTH_float ); - __m128 buf2 = _mm_loadu_ps( D_pt+5*SIMD_LENGTH_float ); - { - __m128 buf3 = _mm_set1_ps( *(pr[mu]+j+4) ); - __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+5) ); - cfmadd( buf1, buf2, buf3, buf4, &res1[0], &res1[1] ); - } - { - __m128 buf3 = _mm_set1_ps( *(pr[mu]+j+10) ); - __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+11) ); - cfmadd( buf1, buf2, buf3, buf4, &res2[0], &res2[1] ); - } - { - __m128 buf3 = _mm_set1_ps( *(pr[mu]+j+16) ); - __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+17) ); - cfmadd( buf1, buf2, buf3, buf4, &res1[2], &res1[3] ); - } - { - __m128 buf3 = _mm_set1_ps( *(pr[mu]+j+22) ); - __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+23) ); - cfmadd( buf1, buf2, buf3, buf4, &res2[2], &res2[3] ); - } - } - } - - { - // store spin0 contribution - { - __m128 buf1 = _mm_unpacklo_ps( res1[0], res1[1] ); - __m128 buf2 = _mm_unpackhi_ps( res1[0], res1[1] ); - eta_lo1 = _mm_sub_ps( eta_lo1, buf1 ); - eta_lo2 = _mm_sub_ps( eta_lo2, buf2 ); - } - { - __m128 buf1 = _mm_unpacklo_ps( res1[2], res1[3] ); - __m128 buf2 = _mm_unpackhi_ps( res1[2], res1[3] ); - eta_lo3 = _mm_sub_ps( eta_lo3, buf1 ); - eta_lo4 = _mm_sub_ps( eta_lo4, buf2 ); - } - - // store contribution from 1st SU(3) multiplication to either spin2 or spin3 - { - __m128 buf1 = _mm_mul_ps( gamma0[mu][0], res1[gamma_offset[mu][0]] ); - __m128 buf2 = _mm_mul_ps( gamma0[mu][1], res1[1-gamma_offset[mu][0]] ); - __m128 buf3 = _mm_unpacklo_ps( buf1, buf2 ); - __m128 buf4 = _mm_unpackhi_ps( buf1, buf2 ); - buf4 = _mm_shuffle_ps( buf3, buf4, _MM_SHUFFLE(1,0,3,2) ); // hi(buf3), lo(buf4) - eta2_lo[gamma_co[mu][2]] = _mm_sub_ps( eta2_lo[gamma_co[mu][2]], buf3 ); - eta2_hi[gamma_co[mu][2]] = _mm_sub_ps( eta2_hi[gamma_co[mu][2]], buf4 ); - } - { - __m128 buf1 = _mm_mul_ps( gamma0[mu][0], res1[2+gamma_offset[mu][0]] ); - __m128 buf2 = _mm_mul_ps( gamma0[mu][1], res1[3-gamma_offset[mu][0]] ); - __m128 buf3 = _mm_unpacklo_ps( buf1, buf2 ); - __m128 buf4 = _mm_unpackhi_ps( buf1, buf2 ); - buf4 = _mm_shuffle_ps( buf3, buf4, _MM_SHUFFLE(1,0,3,2) ); // hi(buf3), lo(buf4) - eta2_lo[2+gamma_co[mu][2]] = _mm_sub_ps( eta2_lo[2+gamma_co[mu][2]], buf3 ); - eta2_hi[2+gamma_co[mu][2]] = _mm_sub_ps( eta2_hi[2+gamma_co[mu][2]], buf4 ); - } - } - { - // store spin1 contribution - { - __m128 buf1 = _mm_unpacklo_ps( res2[0], res2[1] ); - __m128 buf2 = _mm_unpackhi_ps( res2[0], res2[1] ); - eta_hi1 = _mm_sub_ps( eta_hi1, buf1 ); - eta_hi2 = _mm_sub_ps( eta_hi2, buf2 ); - } - { - __m128 buf1 = _mm_unpacklo_ps( res2[2], res2[3] ); - __m128 buf2 = _mm_unpackhi_ps( res2[2], res2[3] ); - eta_hi3 = _mm_sub_ps( eta_hi3, buf1 ); - eta_hi4 = _mm_sub_ps( eta_hi4, buf2 ); - } - - // store contribution from 1st SU(3) multiplication to either spin2 or spin3 - { - __m128 buf1 = _mm_mul_ps( gamma1[mu][0], res2[gamma_offset[mu][1]] ); - __m128 buf2 = _mm_mul_ps( gamma1[mu][1], 
res2[1-gamma_offset[mu][1]] ); - __m128 buf3 = _mm_unpacklo_ps( buf1, buf2 ); - __m128 buf4 = _mm_unpackhi_ps( buf1, buf2 ); - buf4 = _mm_shuffle_ps( buf3, buf4, _MM_SHUFFLE(1,0,3,2) ); // hi(buf3), lo(buf4) - eta2_lo[gamma_co[mu][3]] = _mm_sub_ps( eta2_lo[gamma_co[mu][3]], buf3 ); - eta2_hi[gamma_co[mu][3]] = _mm_sub_ps( eta2_hi[gamma_co[mu][3]], buf4 ); - } - { - __m128 buf1 = _mm_mul_ps( gamma1[mu][0], res2[2+gamma_offset[mu][1]] ); - __m128 buf2 = _mm_mul_ps( gamma1[mu][1], res2[3-gamma_offset[mu][1]] ); - __m128 buf3 = _mm_unpacklo_ps( buf1, buf2 ); - __m128 buf4 = _mm_unpackhi_ps( buf1, buf2 ); - buf4 = _mm_shuffle_ps( buf3, buf4, _MM_SHUFFLE(1,0,3,2) ); // hi(buf3), lo(buf4) - eta2_lo[2+gamma_co[mu][3]] = _mm_sub_ps( eta2_lo[2+gamma_co[mu][3]], buf3 ); - eta2_hi[2+gamma_co[mu][3]] = _mm_sub_ps( eta2_hi[2+gamma_co[mu][3]], buf4 ); - } - } - nb_pt++; - D_pt += 24; - } - - _mm_storeu_ps( eta_pt, eta_lo1 ); - _mm_storeu_ps( eta_pt+4, eta_lo2 ); - _mm_storeu_ps( eta_pt+6, eta_hi1 ); - _mm_storeu_ps( eta_pt+10, eta_hi2 ); - _mm_storeu_ps( eta_pt+12, eta_lo3 ); - _mm_storeu_ps( eta_pt+16, eta_lo4 ); - _mm_storeu_ps( eta_pt+18, eta_hi3 ); - _mm_storeu_ps( eta_pt+22, eta_hi4 ); - _mm_storeu_ps( eta_pt+24, eta2_lo[0] ); - _mm_storeu_ps( eta_pt+26, eta2_hi[0] ); - _mm_storeu_ps( eta_pt+30, eta2_lo[1] ); - _mm_storeu_ps( eta_pt+32, eta2_hi[1] ); - _mm_storeu_ps( eta_pt+36, eta2_lo[2] ); - _mm_storeu_ps( eta_pt+38, eta2_hi[2] ); - _mm_storeu_ps( eta_pt+42, eta2_lo[3] ); - _mm_storeu_ps( eta_pt+44, eta2_hi[3] ); - - eta_pt += 48; - } - -} - - -void block_oddeven_plus_coupling_double( double *eta, double *D, double *phi, int mu, int start, int end, int *ind, int *neighbor ) { } -void block_oddeven_pT_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_sub_ps -#define MU 0 -#include neighbor_coupling_file -#undef MU -#undef UPD -} -void block_oddeven_pZ_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_sub_ps -#define MU 1 -#include neighbor_coupling_file -#undef MU -#undef UPD -} -void block_oddeven_pY_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_sub_ps -#define MU 2 -#include neighbor_coupling_file -#undef MU -#undef UPD -} -void block_oddeven_pX_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_sub_ps -#define MU 3 -#include neighbor_coupling_file -#undef MU -#undef UPD -} -void block_oddeven_plus_coupling_float( float *eta, float *D, float *phi, int mu, int start, int end, int *ind, int *neighbor ) { - switch ( mu ) { - case T: block_oddeven_pT_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - case Z: block_oddeven_pZ_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - case Y: block_oddeven_pY_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - case X: block_oddeven_pX_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - default: error0("block_oddeven_plus_coupling_float: invalid mu=%d\n", mu ); - } -} - - -void block_oddeven_nplus_coupling_double( double *eta, double *D, double *phi, int mu, int start, int end, int *ind, int *neighbor ) { } -void block_oddeven_npT_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_add_ps -#define MU 0 -#include neighbor_coupling_file -#undef MU -#undef UPD -} -void 
block_oddeven_npZ_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_add_ps -#define MU 1 -#include neighbor_coupling_file -#undef MU -#undef UPD -} -void block_oddeven_npY_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_add_ps -#define MU 2 -#include neighbor_coupling_file -#undef MU -#undef UPD -} -void block_oddeven_npX_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_add_ps -#define MU 3 -#include neighbor_coupling_file -#undef MU -#undef UPD -} -void block_oddeven_nplus_coupling_float( float *eta, float *D, float *phi, int mu, int start, int end, int *ind, int *neighbor ) { - switch ( mu ) { - case T: block_oddeven_npT_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - case Z: block_oddeven_npZ_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - case Y: block_oddeven_npY_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - case X: block_oddeven_npX_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - default: error0("block_oddeven_nplus_coupling_float: invalid mu=%d\n", mu ); - } -} - - -void block_oddeven_minus_coupling_double( double *eta, double *D, double *phi, int mu, int start, int end, int *ind, int *neighbor ) { } -void block_oddeven_mT_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_sub_ps -#define MU 0 -#define MINUSDIR -#include neighbor_coupling_file -#undef MINUSDIR -#undef MU -#undef UPD -} -void block_oddeven_mZ_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_sub_ps -#define MU 1 -#define MINUSDIR -#include neighbor_coupling_file -#undef MINUSDIR -#undef MU -#undef UPD -} -void block_oddeven_mY_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_sub_ps -#define MU 2 -#define MINUSDIR -#include neighbor_coupling_file -#undef MINUSDIR -#undef MU -#undef UPD -} -void block_oddeven_mX_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_sub_ps -#define MU 3 -#define MINUSDIR -#include neighbor_coupling_file -#undef MINUSDIR -#undef MU -#undef UPD -} -void block_oddeven_minus_coupling_float( float *eta, float *D, float *phi, int mu, int start, int end, int *ind, int *neighbor ) { - switch ( mu ) { - case T: block_oddeven_mT_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - case Z: block_oddeven_mZ_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - case Y: block_oddeven_mY_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - case X: block_oddeven_mX_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - default: error0("block_oddeven_minus_coupling_float: invalid mu=%d\n", mu ); - } -} - - -void block_oddeven_nminus_coupling_double( double *eta, double *D, double *phi, int mu, int start, int end, int *ind, int *neighbor ) { } -void block_oddeven_nmT_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_add_ps -#define MU 0 -#define MINUSDIR -#include neighbor_coupling_file -#undef MINUSDIR -#undef MU -#undef UPD -} -void block_oddeven_nmZ_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD 
_mm_add_ps -#define MU 1 -#define MINUSDIR -#include neighbor_coupling_file -#undef MINUSDIR -#undef MU -#undef UPD -} -void block_oddeven_nmY_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_add_ps -#define MU 2 -#define MINUSDIR -#include neighbor_coupling_file -#undef MINUSDIR -#undef MU -#undef UPD -} -void block_oddeven_nmX_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_add_ps -#define MU 3 -#define MINUSDIR -#include neighbor_coupling_file -#undef MINUSDIR -#undef MU -#undef UPD -} -void block_oddeven_nminus_coupling_float( float *eta, float *D, float *phi, int mu, int start, int end, int *ind, int *neighbor ) { - switch ( mu ) { - case T: block_oddeven_nmT_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - case Z: block_oddeven_nmZ_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - case Y: block_oddeven_nmY_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - case X: block_oddeven_nmX_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - default: error0("block_oddeven_nminus_coupling_float: invalid mu=%d\n", mu ); - } -} - - -void boundary_nminus_coupling_double( double *eta, double *D, double *phi, int mu, int start, int end, int *ind, int *neighbor ) { } -void boundary_nmT_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_add_ps -#define MU 0 -#define BOUNDARY -#define MINUSDIR -#include neighbor_coupling_file -#undef MINUSDIR -#undef BOUNDARY -#undef MU -#undef UPD -} -void boundary_nmZ_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_add_ps -#define MU 1 -#define BOUNDARY -#define MINUSDIR -#include neighbor_coupling_file -#undef MINUSDIR -#undef BOUNDARY -#undef MU -#undef UPD -} -void boundary_nmY_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_add_ps -#define MU 2 -#define BOUNDARY -#define MINUSDIR -#include neighbor_coupling_file -#undef MINUSDIR -#undef BOUNDARY -#undef MU -#undef UPD -} -void boundary_nmX_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_add_ps -#define MU 3 -#define BOUNDARY -#define MINUSDIR -#include neighbor_coupling_file -#undef MINUSDIR -#undef BOUNDARY -#undef MU -#undef UPD -} -void boundary_nminus_coupling_float( float *eta, float *D, float *phi, int mu, int start, int end, int *ind, int *neighbor ) { - switch ( mu ) { - case T: boundary_nmT_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - case Z: boundary_nmZ_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - case Y: boundary_nmY_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - case X: boundary_nmX_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - default: error0("boundary_nminus_coupling_float: invalid mu=%d\n", mu ); - } -} - - -void boundary_nplus_coupling_double( double *eta, double *D, double *phi, int mu, int start, int end, int *ind, int *neighbor ) { } -void boundary_npT_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_add_ps -#define MU 0 -#define BOUNDARY -#include neighbor_coupling_file -#undef BOUNDARY -#undef MU -#undef UPD -} -void boundary_npZ_coupling_float( float *eta, float *D, float *phi, int 
start, int end, int *ind, int *neighbor ) { -#define UPD _mm_add_ps -#define MU 1 -#define BOUNDARY -#include neighbor_coupling_file -#undef BOUNDARY -#undef MU -#undef UPD -} -void boundary_npY_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_add_ps -#define MU 2 -#define BOUNDARY -#include neighbor_coupling_file -#undef BOUNDARY -#undef MU -#undef UPD -} -void boundary_npX_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_add_ps -#define MU 3 -#define BOUNDARY -#include neighbor_coupling_file -#undef BOUNDARY -#undef MU -#undef UPD -} -void boundary_nplus_coupling_float( float *eta, float *D, float *phi, int mu, int start, int end, int *ind, int *neighbor ) { - switch ( mu ) { - case T: boundary_npT_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - case Z: boundary_npZ_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - case Y: boundary_npY_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - case X: boundary_npX_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - default: error0("boundary_nplus_coupling_float: invalid mu=%d\n", mu ); - } -} - - -void boundary_minus_coupling_double( double *eta, double *D, double *phi, int mu, int start, int end, int *ind, int *neighbor ) { } -void boundary_mT_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_sub_ps -#define MU 0 -#define BOUNDARY -#define MINUSDIR -#include neighbor_coupling_file -#undef MINUSDIR -#undef BOUNDARY -#undef MU -#undef UPD -} -void boundary_mZ_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_sub_ps -#define MU 1 -#define BOUNDARY -#define MINUSDIR -#include neighbor_coupling_file -#undef MINUSDIR -#undef BOUNDARY -#undef MU -#undef UPD -} -void boundary_mY_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_sub_ps -#define MU 2 -#define BOUNDARY -#define MINUSDIR -#include neighbor_coupling_file -#undef MINUSDIR -#undef BOUNDARY -#undef MU -#undef UPD -} -void boundary_mX_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_sub_ps -#define MU 3 -#define BOUNDARY -#define MINUSDIR -#include neighbor_coupling_file -#undef MINUSDIR -#undef BOUNDARY -#undef MU -#undef UPD -} -void boundary_minus_coupling_float( float *eta, float *D, float *phi, int mu, int start, int end, int *ind, int *neighbor ) { - switch ( mu ) { - case T: boundary_mT_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - case Z: boundary_mZ_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - case Y: boundary_mY_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - case X: boundary_mX_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - default: error0("boundary_minus_coupling_float: invalid mu=%d\n", mu ); - } -} - - -void boundary_plus_coupling_double( double *eta, double *D, double *phi, int mu, int start, int end, int *ind, int *neighbor ) { } -void boundary_pT_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_sub_ps -#define MU 0 -#define BOUNDARY -#include neighbor_coupling_file -#undef BOUNDARY -#undef MU -#undef UPD -} -void boundary_pZ_coupling_float( float *eta, float *D, float *phi, int 
start, int end, int *ind, int *neighbor ) {
-#define UPD _mm_sub_ps
-#define MU 1
-#define BOUNDARY
-#include neighbor_coupling_file
-#undef BOUNDARY
-#undef MU
-#undef UPD
-}
-void boundary_pY_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) {
-#define UPD _mm_sub_ps
-#define MU 2
-#define BOUNDARY
-#include neighbor_coupling_file
-#undef BOUNDARY
-#undef MU
-#undef UPD
-}
-void boundary_pX_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) {
-#define UPD _mm_sub_ps
-#define MU 3
-#define BOUNDARY
-#include neighbor_coupling_file
-#undef BOUNDARY
-#undef MU
-#undef UPD
-}
-void boundary_plus_coupling_float( float *eta, float *D, float *phi, int mu, int start, int end, int *ind, int *neighbor ) {
-  switch ( mu ) {
-    case T: boundary_pT_coupling_float( eta, D, phi, start, end, ind, neighbor ); break;
-    case Z: boundary_pZ_coupling_float( eta, D, phi, start, end, ind, neighbor ); break;
-    case Y: boundary_pY_coupling_float( eta, D, phi, start, end, ind, neighbor ); break;
-    case X: boundary_pX_coupling_float( eta, D, phi, start, end, ind, neighbor ); break;
-    default: error0("boundary_plus_coupling_float: invalid mu=%d\n", mu );
-  }
-}
-
-
-
-
-
-static inline int sse_clover_real_index( int i, int j ) {
-  return (i/SIMD_LENGTH_float)*12*SIMD_LENGTH_float + SIMD_LENGTH_float*j*2 + i%SIMD_LENGTH_float;
-}
-
-static inline int sse_clover_imag_index( int i, int j ) {
-  return (i/SIMD_LENGTH_float)*12*SIMD_LENGTH_float + SIMD_LENGTH_float*(j*2+1) + i%SIMD_LENGTH_float;
-}
-
-void sse_set_clover_double( double *out, complex_double *in ) { }
-
-void sse_set_clover_float( float *out, complex_float *in ) {
-
-  int index;
-  float sign = 0.0;
-  for ( int k=0; k<12; k+=SIMD_LENGTH_float ) {
-    for ( int j=0; j<6; j++ ) {
-      for ( int i=0; i<SIMD_LENGTH_float; i++ ) {
-        if ( i+k == j || i+k == j+6 ) {
-          // diagonal entry
-          index = i+k;
-          sign = 1.0;
-        } else if ( i+k<6 ) {
-          // first 6-by-6 matrix
-          if ( j > i+k ) {
-            // upper triangle
-            index = 12 + ( 30 - (5-(k+i))*(6-(k+i)) )/2 + (j-(i+k+1));
-            sign = 1.0;
-          } else {
-            // lower triangle, j < i+k
-            index = 12 + ( 30 - (5-(j))*(6-(j)) )/2 + ((i+k)-(j+1));
-            sign = -1.0;
-          }
-        } else {
-          // i+k >= 6
-          // second 6-by-6 matrix
-          if ( j > i+k-6 ) {
-            // upper triangle
-            index = 12 + 15 + ( 30 - (5-(k+i-6))*(6-(k+i-6)) )/2 + (j-(i+k-6+1));
-            sign = 1.0;
-          } else {
-            // j < i+k-6
-            // lower triangle
-            index = 12 + 15 + ( 30 - (5-(j))*(6-(j)) )/2 + ((i+k-6)-(j+1));
-            sign = -1.0;
-          }
-        }
-        out[ sse_clover_real_index(i+k,j) ] = creal_float( (complex_float)in[index] );
-        out[ sse_clover_imag_index(i+k,j) ] = sign*cimag_float( (complex_float)in[index] );
-      }
-    }
-  }
-}
-
-void sse_set_clover_doublet_double( double *out, complex_double *in ) { }
-
-void sse_set_clover_doublet_float( float *out, complex_float *in ) {
-
-  int index, d;
-  float sign = 0.0;
-  for ( int k=0; k<12; k+=SIMD_LENGTH_float ) {
-    for ( int j=0; j<6; j++ ) {
-      for ( int i=0; i<SIMD_LENGTH_float; i++ ) {
-        if ( i+k == j || i+k == j+6 ) {
-          // diagonal entry
-          index = i+k;
-          sign = 1.0;
-        } else if ( i+k<6 ) {
-          // first 6-by-6 matrix
-          if ( j > i+k ) {
-            // upper triangle
-            index = 12 + ( 30 - (5-(k+i))*(6-(k+i)) )/2 + (j-(i+k+1));
-            sign = 1.0;
-          } else {
-            // lower triangle, j < i+k
-            index = 12 + ( 30 - (5-(j))*(6-(j)) )/2 + ((i+k)-(j+1));
-            sign = -1.0;
-          }
-        } else {
-          // i+k >= 6
-          // second 6-by-6 matrix
-          if ( j > i+k-6 ) {
-            // upper triangle
-            index = 12 + 15 + ( 30 - (5-(k+i-6))*(6-(k+i-6)) )/2 + (j-(i+k-6+1));
-            sign = 1.0;
-          } else {
-            // j < i+k-6
-            // lower triangle
-            index = 12 + 15 + ( 30 - (5-(j))*(6-(j)) )/2 + ((i+k-6)-(j+1));
-            sign = -1.0;
-          }
-        }
-        d=(i+k<6)?0:6;
-        out[ sse_clover_real_index(i+k+d,j) ] = creal_float( in[index] );
-        out[ sse_clover_imag_index(i+k+d,j) ] = sign*cimag_float( in[index] );
-        out[
sse_clover_real_index(i+k+d+6,j) ] = creal_float( in[index] ); - out[ sse_clover_imag_index(i+k+d+6,j) ] = sign*cimag_float( in[index] ); - } - } - } -} - -void sse_add_diagonal_clover_double( double *out, complex_double *diag ) { } - -void sse_add_diagonal_clover_float( float *out, complex_float *diag ) { - for ( int k=0; k<12; k++ ) { - out[ sse_clover_real_index(k,k%6) ] += creal_float( diag[k] ); - out[ sse_clover_imag_index(k,k%6) ] += cimag_float( diag[k] ); - } -} - -void sse_add_diagonal_clover_doublet_double( double *out, complex_double *diag ) { } - -void sse_add_diagonal_clover_doublet_float( float *out, complex_float *diag ) { - for ( int k=0; k<6; k++ ) { - out[ sse_clover_real_index(k,k%6) ] += creal_float( diag[k] ); - out[ sse_clover_imag_index(k,k%6) ] += cimag_float( diag[k] ); - out[ sse_clover_real_index(k+6,k%6) ] -= creal_float( diag[k] ); - out[ sse_clover_imag_index(k+6,k%6) ] -= cimag_float( diag[k] ); - } - for ( int k=6; k<12; k++ ) { - out[ sse_clover_real_index(k+6,k%6) ] += creal_float( diag[k] ); - out[ sse_clover_imag_index(k+6,k%6) ] += cimag_float( diag[k] ); - out[ sse_clover_real_index(k+12,k%6) ] -= creal_float( diag[k] ); - out[ sse_clover_imag_index(k+12,k%6) ] -= cimag_float( diag[k] ); - } -} - -void sse_site_clover_double( double *eta, const double *phi, const double *clover ) { - -} - -void sse_site_clover_float( float *eta, const float *phi, float *clover ) { - - __m128 in_re; - __m128 in_im; - - __m128 clov_re; - __m128 clov_im; - - __m128 out_re; - __m128 out_im; - -#ifdef HAVE_TM1p1 - if( g.n_flavours == 2 ) { - // lines 1--4; indeces from 0 to 47 - in_re = _mm_set1_ps( phi[0] ); - in_im = _mm_set1_ps( phi[1] ); - clov_re = _mm_load_ps( clover ); - clov_im = _mm_load_ps( clover+SIMD_LENGTH_float ); - cmul( clov_re, clov_im, in_re, in_im, &out_re, &out_im ); - clover+=2*SIMD_LENGTH_float; - - for ( int i=1; i<6; i++ ) { - in_re = _mm_set1_ps( phi[2*i] ); - in_im = _mm_set1_ps( phi[2*i+1] ); - clov_re = _mm_load_ps( clover ); - clov_im = _mm_load_ps( clover+SIMD_LENGTH_float ); - cfmadd( clov_re, clov_im, in_re, in_im, &out_re, &out_im ); - clover+=2*SIMD_LENGTH_float; - } - - sse_complex_interleaved_store( out_re, out_im, eta ); - - // lines 5--8; indeces from 48 to 95 - in_re = _mm_setr_ps( phi[0], phi[0], phi[12], phi[12] ); - in_im = _mm_setr_ps( phi[1], phi[1], phi[13], phi[13] ); - clov_re = _mm_load_ps( clover ); - clov_im = _mm_load_ps( clover+SIMD_LENGTH_float ); - cmul( clov_re, clov_im, in_re, in_im, &out_re, &out_im ); - clover+=2*SIMD_LENGTH_float; - - for ( int i=1; i<6; i++ ) { - in_re = _mm_setr_ps( phi[2*i], phi[2*i], phi[2*i+12], phi[2*i+12] ); - in_im = _mm_setr_ps( phi[2*i+1], phi[2*i+1], phi[2*i+13], phi[2*i+13] ); - clov_re = _mm_load_ps( clover ); - clov_im = _mm_load_ps( clover+SIMD_LENGTH_float ); - cfmadd( clov_re, clov_im, in_re, in_im, &out_re, &out_im ); - clover+=2*SIMD_LENGTH_float; - } - - sse_complex_interleaved_store( out_re, out_im, eta+8 ); - - // lines 9--12; indeces from 96 to 143 - in_re = _mm_set1_ps( phi[12] ); - in_im = _mm_set1_ps( phi[13] ); - clov_re = _mm_load_ps( clover ); - clov_im = _mm_load_ps( clover+SIMD_LENGTH_float ); - cmul( clov_re, clov_im, in_re, in_im, &out_re, &out_im ); - clover+=2*SIMD_LENGTH_float; - - for ( int i=1; i<6; i++ ) { - in_re = _mm_set1_ps( phi[2*i+12] ); - in_im = _mm_set1_ps( phi[2*i+13] ); - clov_re = _mm_load_ps( clover ); - clov_im = _mm_load_ps( clover+SIMD_LENGTH_float ); - cfmadd( clov_re, clov_im, in_re, in_im, &out_re, &out_im ); - clover+=2*SIMD_LENGTH_float; - 
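/* Each block in sse_site_clover_float is a small matrix-vector product: one
 * input component phi[i] is broadcast, one vectorized clover column
 * (SIMD_LENGTH_float real parts followed by SIMD_LENGTH_float imaginary
 * parts) is loaded, and the product is accumulated. A scalar reference of one
 * such column update, assuming that column layout (function name is
 * illustrative only):
 */
static inline void clover_column_update_sketch( float *out_re, float *out_im,
                                                const float *col, float phi_re, float phi_im, int n ) {
  for ( int r=0; r<n; r++ ) {
    // complex out[r] += col[r]*phi, with col[0..n) real and col[n..2n) imaginary parts
    out_re[r] += col[r]*phi_re - col[n+r]*phi_im;
    out_im[r] += col[r]*phi_im + col[n+r]*phi_re;
  }
}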
} - - sse_complex_interleaved_store( out_re, out_im, eta+16 ); - - // lines 13--16; indeces from 144 to 191 - in_re = _mm_set1_ps( phi[24] ); - in_im = _mm_set1_ps( phi[25] ); - clov_re = _mm_load_ps( clover ); - clov_im = _mm_load_ps( clover+SIMD_LENGTH_float ); - cmul( clov_re, clov_im, in_re, in_im, &out_re, &out_im ); - clover+=2*SIMD_LENGTH_float; - - for ( int i=1; i<6; i++ ) { - in_re = _mm_set1_ps( phi[2*i+24] ); - in_im = _mm_set1_ps( phi[2*i+25] ); - clov_re = _mm_load_ps( clover ); - clov_im = _mm_load_ps( clover+SIMD_LENGTH_float ); - cfmadd( clov_re, clov_im, in_re, in_im, &out_re, &out_im ); - clover+=2*SIMD_LENGTH_float; - } - - sse_complex_interleaved_store( out_re, out_im, eta+24 ); - - // lines 17--20; indeces from 192 to 239 - in_re = _mm_setr_ps( phi[24], phi[24], phi[36], phi[36] ); - in_im = _mm_setr_ps( phi[25], phi[25], phi[37], phi[37] ); - clov_re = _mm_load_ps( clover ); - clov_im = _mm_load_ps( clover+SIMD_LENGTH_float ); - cmul( clov_re, clov_im, in_re, in_im, &out_re, &out_im ); - clover+=2*SIMD_LENGTH_float; - - for ( int i=1; i<6; i++ ) { - in_re = _mm_setr_ps( phi[2*i+24], phi[2*i+24], phi[2*i+36], phi[2*i+36] ); - in_im = _mm_setr_ps( phi[2*i+25], phi[2*i+25], phi[2*i+37], phi[2*i+37] ); - clov_re = _mm_load_ps( clover ); - clov_im = _mm_load_ps( clover+SIMD_LENGTH_float ); - cfmadd( clov_re, clov_im, in_re, in_im, &out_re, &out_im ); - clover+=2*SIMD_LENGTH_float; - } - - sse_complex_interleaved_store( out_re, out_im, eta+32 ); - - // lines 21--24; indeces from 240 to 287 - in_re = _mm_set1_ps( phi[36] ); - in_im = _mm_set1_ps( phi[37] ); - clov_re = _mm_load_ps( clover ); - clov_im = _mm_load_ps( clover+SIMD_LENGTH_float ); - cmul( clov_re, clov_im, in_re, in_im, &out_re, &out_im ); - clover+=2*SIMD_LENGTH_float; - - for ( int i=1; i<6; i++ ) { - in_re = _mm_set1_ps( phi[2*i+36] ); - in_im = _mm_set1_ps( phi[2*i+37] ); - clov_re = _mm_load_ps( clover ); - clov_im = _mm_load_ps( clover+SIMD_LENGTH_float ); - cfmadd( clov_re, clov_im, in_re, in_im, &out_re, &out_im ); - clover+=2*SIMD_LENGTH_float; - } - - sse_complex_interleaved_store( out_re, out_im, eta+40 ); - - } else { -#endif - // lines 1--4; indeces from 0 to 47 - in_re = _mm_set1_ps( phi[0] ); - in_im = _mm_set1_ps( phi[1] ); - clov_re = _mm_load_ps( clover ); - clov_im = _mm_load_ps( clover+SIMD_LENGTH_float ); - cmul( clov_re, clov_im, in_re, in_im, &out_re, &out_im ); - clover+=2*SIMD_LENGTH_float; - - for ( int i=1; i<6; i++ ) { - in_re = _mm_set1_ps( phi[2*i] ); - in_im = _mm_set1_ps( phi[2*i+1] ); - clov_re = _mm_load_ps( clover ); - clov_im = _mm_load_ps( clover+SIMD_LENGTH_float ); - cfmadd( clov_re, clov_im, in_re, in_im, &out_re, &out_im ); - clover+=2*SIMD_LENGTH_float; - } - - sse_complex_interleaved_store( out_re, out_im, eta ); - - // lines 5--8; indeces from 48 to 95 - in_re = _mm_setr_ps( phi[0], phi[0], phi[12], phi[12] ); - in_im = _mm_setr_ps( phi[1], phi[1], phi[13], phi[13] ); - clov_re = _mm_load_ps( clover ); - clov_im = _mm_load_ps( clover+SIMD_LENGTH_float ); - cmul( clov_re, clov_im, in_re, in_im, &out_re, &out_im ); - clover+=2*SIMD_LENGTH_float; - - for ( int i=1; i<6; i++ ) { - in_re = _mm_setr_ps( phi[2*i], phi[2*i], phi[2*i+12], phi[2*i+12] ); - in_im = _mm_setr_ps( phi[2*i+1], phi[2*i+1], phi[2*i+13], phi[2*i+13] ); - clov_re = _mm_load_ps( clover ); - clov_im = _mm_load_ps( clover+SIMD_LENGTH_float ); - cfmadd( clov_re, clov_im, in_re, in_im, &out_re, &out_im ); - clover+=2*SIMD_LENGTH_float; - } - - sse_complex_interleaved_store( out_re, out_im, eta+8 ); - - // 
lines 9--12; indeces from 96 to 143 - in_re = _mm_set1_ps( phi[12] ); - in_im = _mm_set1_ps( phi[13] ); - clov_re = _mm_load_ps( clover ); - clov_im = _mm_load_ps( clover+SIMD_LENGTH_float ); - cmul( clov_re, clov_im, in_re, in_im, &out_re, &out_im ); - clover+=2*SIMD_LENGTH_float; - - for ( int i=1; i<6; i++ ) { - in_re = _mm_set1_ps( phi[2*i+12] ); - in_im = _mm_set1_ps( phi[2*i+13] ); - clov_re = _mm_load_ps( clover ); - clov_im = _mm_load_ps( clover+SIMD_LENGTH_float ); - cfmadd( clov_re, clov_im, in_re, in_im, &out_re, &out_im ); - clover+=2*SIMD_LENGTH_float; - } - - sse_complex_interleaved_store( out_re, out_im, eta+16 ); -#ifdef HAVE_TM1p1 - } -#endif - -} - -void sse_site_clover_doublet_double( double *eta, const double *phi, const double *clover ) { - -} - -void sse_site_clover_doublet_float( float *eta, const float *phi, float *clover ) { - - __m128 in_re; - __m128 in_im; - - __m128 clov_re; - __m128 clov_im; - - __m128 out_re; - __m128 out_im; - - // lines 1--4; indeces from 0 to 47 - // lines 5--8; indeces from 48 to 95 - // lines 9--12; indeces from 96 to 143 - for( int n=0; n<3; n++ ) { - in_re = _mm_set1_ps( phi[0] ); - in_im = _mm_set1_ps( phi[1] ); - clov_re = _mm_load_ps( clover ); - clov_im = _mm_load_ps( clover+SIMD_LENGTH_float ); - cmul( clov_re, clov_im, in_re, in_im, &out_re, &out_im ); - clover+=2*SIMD_LENGTH_float; - - for ( int i=1; i<12; i++ ) { - in_re = _mm_set1_ps( phi[2*i] ); - in_im = _mm_set1_ps( phi[2*i+1] ); - clov_re = _mm_load_ps( clover ); - clov_im = _mm_load_ps( clover+SIMD_LENGTH_float ); - cfmadd( clov_re, clov_im, in_re, in_im, &out_re, &out_im ); - clover+=2*SIMD_LENGTH_float; - } - - sse_complex_interleaved_store( out_re, out_im, eta + n*8 ); - } - - - // lines 13--16; indeces from 144 to 191 - // lines 17--20; indeces from 192 to 239 - // lines 21--24; indeces from 240 to 287 - for( int n=3; n<6; n++ ) { - in_re = _mm_set1_ps( phi[24] ); - in_im = _mm_set1_ps( phi[25] ); - clov_re = _mm_load_ps( clover ); - clov_im = _mm_load_ps( clover+SIMD_LENGTH_float ); - cmul( clov_re, clov_im, in_re, in_im, &out_re, &out_im ); - clover+=2*SIMD_LENGTH_float; - - for ( int i=1; i<12; i++ ) { - in_re = _mm_set1_ps( phi[2*i+24] ); - in_im = _mm_set1_ps( phi[2*i+25] ); - clov_re = _mm_load_ps( clover ); - clov_im = _mm_load_ps( clover+SIMD_LENGTH_float ); - cfmadd( clov_re, clov_im, in_re, in_im, &out_re, &out_im ); - clover+=2*SIMD_LENGTH_float; - } - - sse_complex_interleaved_store( out_re, out_im, eta + n*8 ); - } -} - - - -void sse_site_clover_invert_double( double *clover_in, double *clover_out ) { } - -void sse_site_clover_invert_float( float *clover_in, float *clover_out ) { - - float M_tmp1[72], M_tmp2[72]; - - for ( int k=0; k<12; k+=SIMD_LENGTH_float ) { - for ( int j=0; j<6; j++ ) { - for ( int i=k; i -#include - -// res = a*b + c -static inline __m128d sse_fmadd_pd( __m128d a, __m128d b, __m128d c ) { - __m128d res; - res = _mm_mul_pd( a, b ); - res = _mm_add_pd( res, c ); - return res; -} - -// res = -a*b + c -static inline __m128d sse_fnmadd_pd( __m128d a, __m128d b, __m128d c ) { - __m128d res; - res = _mm_mul_pd( a, b ); - res = _mm_sub_pd( c, res ); - return res; -} - -// res = a*b - c -static inline __m128d sse_fmsub_pd( __m128d a, __m128d b, __m128d c ) { - __m128d res; - res = _mm_mul_pd( a, b ); - res = _mm_sub_pd( res, c ); - return res; -} - -static inline double sse_reduce_add_pd( __m128d data ) { - double result; - data = _mm_add_pd( data, _mm_unpackhi_pd( data, data ) ); - _mm_store_sd( &result, data ); - return result; -} - -#endif 
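/* Usage sketch for the fused helpers defined above: a short real dot product
 * that keeps two partial sums in one register and reduces once at the end.
 * The function name and the even-length assumption are illustrative only:
 */
static inline double sse_dot_pd_sketch( const double *x, const double *y, int n ) {
  __m128d acc = _mm_setzero_pd();                      // two running partial sums
  for ( int i=0; i<n; i+=2 )                           // assumes n is even
    acc = sse_fmadd_pd( _mm_loadu_pd( x+i ), _mm_loadu_pd( y+i ), acc );
  return sse_reduce_add_pd( acc );                     // horizontal add of both lanes
}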
-#endif
\ No newline at end of file
diff --git a/src/sse_float_intrinsic.h b/src/sse_float_intrinsic.h
deleted file mode 100644
index 33220ba..0000000
--- a/src/sse_float_intrinsic.h
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
- * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder.
- *
- * This file is part of the DDalphaAMG solver library.
- *
- * The DDalphaAMG solver library is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * The DDalphaAMG solver library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- *
- * You should have received a copy of the GNU General Public License
- * along with the DDalphaAMG solver library. If not, see http://www.gnu.org/licenses/.
- *
- */
-
-#ifndef FLOAT_INTRINSIC_SSE_H
-#define FLOAT_INTRINSIC_SSE_H
-
-#ifdef SSE
-#include <xmmintrin.h>
-#include <emmintrin.h>
-
-// res = a*b + c
-static inline __m128 sse_fmadd( __m128 a, __m128 b, __m128 c ) {
-  __m128 res;
-  res = _mm_mul_ps( a, b );
-  res = _mm_add_ps( res, c );
-  return res;
-}
-
-// res = -a*b + c
-static inline __m128 sse_fnmadd( __m128 a, __m128 b, __m128 c ) {
-  __m128 res;
-  res = _mm_mul_ps( a, b );
-  res = _mm_sub_ps( c, res );
-  return res;
-}
-
-// res = a*b - c
-static inline __m128 sse_fmsub( __m128 a, __m128 b, __m128 c ) {
-  __m128 res;
-  res = _mm_mul_ps( a, b );
-  res = _mm_sub_ps( res, c );
-  return res;
-}
-
-// res = -a*b - c
-static inline __m128 sse_fnmsub( __m128 a, __m128 b, __m128 c ) {
-  __m128 res; __m128 minus_a;
-  minus_a = _mm_setzero_ps();
-  minus_a = _mm_sub_ps( minus_a, a );
-  res = _mm_mul_ps( minus_a, b );
-  res = _mm_sub_ps( res, c );
-  return res;
-}
-
-static inline void transpose_4_registers( __m128 *data)
-{
-  __m128 tmp[4];
-
-  tmp[0] = _mm_unpacklo_ps( data[0], data[1] );
-  tmp[1] = _mm_unpacklo_ps( data[2], data[3] );
-  tmp[2] = _mm_unpackhi_ps( data[0], data[1] );
-  tmp[3] = _mm_unpackhi_ps( data[2], data[3] );
-
-  data[0] = _mm_movelh_ps( tmp[0], tmp[1] );
-  data[1] = _mm_movehl_ps( tmp[1], tmp[0] );
-  data[2] = _mm_movelh_ps( tmp[2], tmp[3] );
-  data[3] = _mm_movehl_ps( tmp[3], tmp[2] );
-}
-
-
-static inline float sse_reduce_add_ps( __m128 data ) {
-  float result;
-
-  __m128 tmp;
-  tmp = _mm_add_ps( data, _mm_movehl_ps( data, data ) );
-  data = _mm_add_ss( tmp, _mm_shuffle_ps( tmp, tmp, 1 ) );
-  _mm_store_ss( &result, data );
-
-  return result;
-}
-
-#endif
-
-#endif // FLOAT_INTRINSIC_SSE_H
diff --git a/src/sse_interpolation_generic.c b/src/sse_interpolation_generic.c
deleted file mode 100644
index 876055f..0000000
--- a/src/sse_interpolation_generic.c
+++ /dev/null
@@ -1,672 +0,0 @@
-/*
- * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder.
- *
- * This file is part of the DDalphaAMG solver library.
- *
- * The DDalphaAMG solver library is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- * - * The DDalphaAMG solver library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * - * You should have received a copy of the GNU General Public License - * along with the DDalphaAMG solver library. If not, see http://www.gnu.org/licenses/. - * - */ - -#include "main.h" - -#if defined( SSE ) && defined( INTERPOLATION_OPERATOR_LAYOUT_OPTIMIZED_PRECISION ) - -void interpolation_PRECISION_alloc( level_struct *l ) { - - int k, n = l->num_eig_vect; - - MALLOC( l->is_PRECISION.eigenvalues, complex_PRECISION, n ); - MALLOC( l->is_PRECISION.test_vector, vector_PRECISION, n ); - -#ifndef INTERPOLATION_SETUP_LAYOUT_OPTIMIZED_PRECISION - MALLOC( l->is_PRECISION.interpolation, vector_PRECISION, n ); - for ( k=0; kis_PRECISION.interpolation[k]) ); - vector_PRECISION_alloc( &(l->is_PRECISION.interpolation[k]), _ORDINARY, 1, l, no_threading ); - } -#endif - // ghost shell is communicated in coarse_operator_setup, so we need size=vector_size, not inner_vector_size - MALLOC_HUGEPAGES( l->is_PRECISION.operator, complex_PRECISION, - ((size_t)OPERATOR_COMPONENT_OFFSET_PRECISION)*((size_t)l->vector_size), 128 ); - - for ( k=0; kis_PRECISION.test_vector[k]) ); - vector_PRECISION_alloc( &(l->is_PRECISION.test_vector[k]), _INNER, 1, l, no_threading ); - } -} - - -void interpolation_PRECISION_dummy_alloc( level_struct *l ) { - - MALLOC( l->is_PRECISION.test_vector, vector_PRECISION, l->num_eig_vect ); - MALLOC( l->is_PRECISION.interpolation, vector_PRECISION, l->num_eig_vect ); -} - - -void interpolation_PRECISION_dummy_free( level_struct *l ) { - - FREE( l->is_PRECISION.test_vector, vector_PRECISION, l->num_eig_vect ); - FREE( l->is_PRECISION.interpolation, vector_PRECISION, l->num_eig_vect ); -} - - -void interpolation_PRECISION_free( level_struct *l ) { - - int n = l->num_eig_vect; - - for (int k=0; kis_PRECISION.test_vector[k]), l, no_threading ); - } - FREE( l->is_PRECISION.eigenvalues, complex_PRECISION, n ); - FREE( l->is_PRECISION.test_vector, vector_PRECISION, n ); -#ifndef INTERPOLATION_SETUP_LAYOUT_OPTIMIZED_PRECISION - for (int k=0; kis_PRECISION.interpolation[k]), l, no_threading ); - } - FREE( l->is_PRECISION.interpolation, vector_PRECISION, n ); -#endif - FREE_HUGEPAGES( l->is_PRECISION.operator, complex_PRECISION, OPERATOR_COMPONENT_OFFSET_PRECISION*l->vector_size ); -} - - -void swap8_PRECISION( PRECISION* data ) { - - int i; - PRECISION tmp[8]; - - for ( i=0; i<4; i++ ) { - tmp[i] = data[2*i]; - tmp[i+4] = data[2*i+1]; - } - - for ( i=0; i<8; i++ ) { - data[i] = tmp[i]; - } -} - - -void define_interpolation_PRECISION_operator( vector_PRECISION *interpolation, level_struct *l, struct Thread *threading ) { - - int j, num_eig_vect = l->num_eig_vect; - complex_PRECISION *operator = l->is_PRECISION.operator; - - int start = threading->start_index[l->depth]; - int end = threading->end_index[l->depth]; - - SYNC_CORES(threading) - int offset = SIMD_LENGTH_PRECISION; - for ( j=0; j num_eig_vect) - j_end = num_eig_vect; - - operator = l->is_PRECISION.operator + j*l->vector_size + start*offset; - - for ( int i=start; iis_PRECISION.num_agg, num_eig_vect = l->num_eig_vect, - num_parent_eig_vect = l->num_parent_eig_vect, aggregate_sites = l->num_inner_lattice_sites / num_aggregates; - complex_PRECISION *operator = l->is_PRECISION.operator, *phi_pt = phi->vector_buffer, - *phi_c_pt = 
l->next_level->gs_PRECISION.transfer_buffer.vector_buffer; - - START_LOCKED_MASTER(threading) - vector_PRECISION_distribute( &(l->next_level->gs_PRECISION.transfer_buffer), phi_c, l->next_level ); - END_LOCKED_MASTER(threading) - SYNC_HYPERTHREADS(threading) - -#ifdef HAVE_TM1p1 - if( g.n_flavours==2 ) - for ( i=threading->n_thread*threading->core + threading->thread; in_core*threading->n_thread ) { - phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer.vector_buffer + i*2*2*num_eig_vect; - float tmp_phi1_c_re[2*OPERATOR_COMPONENT_OFFSET_float]; - float tmp_phi1_c_im[2*OPERATOR_COMPONENT_OFFSET_float]; - float tmp_phi2_c_re[2*OPERATOR_COMPONENT_OFFSET_float]; - float tmp_phi2_c_im[2*OPERATOR_COMPONENT_OFFSET_float]; - __m128 zero = _mm_setzero_ps(); - for ( j=0; j<2*OPERATOR_COMPONENT_OFFSET_PRECISION; j+=SIMD_LENGTH_PRECISION ) { - _mm_store_ps(tmp_phi1_c_re+j, zero); - _mm_store_ps(tmp_phi1_c_im+j, zero); - _mm_store_ps(tmp_phi2_c_re+j, zero); - _mm_store_ps(tmp_phi2_c_im+j, zero); - } - // copy phi_c into temporary - for ( j=0; jvector_buffer + i*2*2*num_parent_eig_vect*aggregate_sites; - operator = l->is_PRECISION.operator + j*l->vector_size/2 + i*2*offset*num_parent_eig_vect*aggregate_sites; - - for ( k=0; kn_thread*threading->core + threading->thread; in_core*threading->n_thread ) { - phi_pt = phi->vector_buffer + i*2*num_parent_eig_vect*aggregate_sites; - phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer.vector_buffer + i*2*num_eig_vect; - float tmp_phi_c_re[2*OPERATOR_COMPONENT_OFFSET_float]; - float tmp_phi_c_im[2*OPERATOR_COMPONENT_OFFSET_float]; - __m128 zero = _mm_setzero_ps(); - for ( j=0; j<2*OPERATOR_COMPONENT_OFFSET_PRECISION; j+=SIMD_LENGTH_PRECISION ) { - _mm_store_ps(tmp_phi_c_re+j, zero); - _mm_store_ps(tmp_phi_c_im+j, zero); - } - // copy phi_c into temporary - for ( j=0; jvector_buffer + i*2*num_parent_eig_vect*aggregate_sites; - operator = l->is_PRECISION.operator + j*l->vector_size + i*2*offset*num_parent_eig_vect*aggregate_sites; - - for ( k=0; kis_PRECISION.num_agg, num_eig_vect = l->num_eig_vect, - num_parent_eig_vect = l->num_parent_eig_vect, aggregate_sites = l->num_inner_lattice_sites / num_aggregates; - complex_PRECISION *operator = l->is_PRECISION.operator, *phi_pt = phi->vector_buffer, - *phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer.vector_buffer; - - START_LOCKED_MASTER(threading) - vector_PRECISION_distribute( &(l->next_level->gs_PRECISION.transfer_buffer), phi_c, l->next_level ); - END_LOCKED_MASTER(threading) - SYNC_HYPERTHREADS(threading) - -#ifdef HAVE_TM1p1 - if( g.n_flavours==2 ) - for ( i=threading->n_thread*threading->core + threading->thread; in_core*threading->n_thread ) { - phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer.vector_buffer + i*2*2*num_eig_vect; - - float tmp_phi1_c_re[2*OPERATOR_COMPONENT_OFFSET_float]; - float tmp_phi1_c_im[2*OPERATOR_COMPONENT_OFFSET_float]; - float tmp_phi2_c_re[2*OPERATOR_COMPONENT_OFFSET_float]; - float tmp_phi2_c_im[2*OPERATOR_COMPONENT_OFFSET_float]; - __m128 zero = _mm_setzero_ps(); - for ( j=0; j<2*OPERATOR_COMPONENT_OFFSET_PRECISION; j+=SIMD_LENGTH_PRECISION ) { - _mm_store_ps(tmp_phi1_c_re+j, zero); - _mm_store_ps(tmp_phi1_c_im+j, zero); - _mm_store_ps(tmp_phi2_c_re+j, zero); - _mm_store_ps(tmp_phi2_c_im+j, zero); - } - // copy phi_c into temporary - for ( j=0; jvector_buffer + i*2*2*num_parent_eig_vect*aggregate_sites; - operator = l->is_PRECISION.operator + j*l->vector_size/2 + i*2*offset*num_parent_eig_vect*aggregate_sites; - - for ( k=0; kn_thread*threading->core + 
threading->thread; in_core*threading->n_thread ) { - phi_pt = phi->vector_buffer + i*2*num_parent_eig_vect*aggregate_sites; - phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer.vector_buffer + i*2*num_eig_vect; - float tmp_phi_c_re[2*OPERATOR_COMPONENT_OFFSET_float]; - float tmp_phi_c_im[2*OPERATOR_COMPONENT_OFFSET_float]; - __m128 zero = _mm_setzero_ps(); - for ( j=0; j<2*OPERATOR_COMPONENT_OFFSET_PRECISION; j+=SIMD_LENGTH_PRECISION ) { - _mm_store_ps(tmp_phi_c_re+j, zero); - _mm_store_ps(tmp_phi_c_im+j, zero); - } - // copy phi_c into temporary - for ( j=0; jvector_buffer + i*2*num_parent_eig_vect*aggregate_sites; - operator = l->is_PRECISION.operator + j*l->vector_size + i*2*offset*num_parent_eig_vect*aggregate_sites; - - for ( k=0; kis_PRECISION.num_agg, num_eig_vect = l->num_eig_vect, - num_parent_eig_vect = l->num_parent_eig_vect, aggregate_sites = l->num_inner_lattice_sites / num_aggregates; - complex_PRECISION *operator = l->is_PRECISION.operator, *phi_pt = phi->vector_buffer, - *phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer.vector_buffer; - -#ifdef HAVE_TM1p1 - if( g.n_flavours==2 ) - for ( i=threading->n_thread*threading->core + threading->thread; in_core*threading->n_thread ) { - - int offset = SIMD_LENGTH_PRECISION; - phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer.vector_buffer + i*2*2*num_eig_vect; - - // loop over blocks of SIMD_LENGTH_PRECISION vectors - for ( j=0; jvector_buffer + i*2*2*num_parent_eig_vect*aggregate_sites; - operator = l->is_PRECISION.operator + j*l->vector_size/2 + i*2*offset*num_parent_eig_vect*aggregate_sites; - - // temporary, so we can used aligned load/store, and don't have to mess around with deinterleaving - // complex components and masking - // factor 2 is for low/high (refers to spin components being split up for preserving gamma5-symmetry of coarse operator) - float tmp_phi1_c_re[2*offset]; - float tmp_phi1_c_im[2*offset]; - float tmp_phi2_c_re[2*offset]; - float tmp_phi2_c_im[2*offset]; - __m128 zero = _mm_setzero_ps(); - for ( k1=0; k1<2*offset; k1+=offset ) { - _mm_store_ps(tmp_phi1_c_re+k1, zero); - _mm_store_ps(tmp_phi1_c_im+k1, zero); - _mm_store_ps(tmp_phi2_c_re+k1, zero); - _mm_store_ps(tmp_phi2_c_im+k1, zero); - } - - for ( k=0; k broadcast - __m128 phi1_re = _mm_set1_ps(((float *)phi_pt)[0]); - __m128 phi1_im = _mm_set1_ps(((float *)phi_pt)[1]); - __m128 phi2_re = _mm_set1_ps(((float *)phi_pt)[0+2*num_parent_eig_vect]); - __m128 phi2_im = _mm_set1_ps(((float *)phi_pt)[1+2*num_parent_eig_vect]); - - __m128 operator_re = _mm_load_ps((float *)operator); - __m128 operator_im = _mm_load_ps((float *)operator+offset); - __m128 phi1_c_re = _mm_load_ps(tmp_phi1_c_re+low_high_offset); - __m128 phi1_c_im = _mm_load_ps(tmp_phi1_c_im+low_high_offset); - __m128 phi2_c_re = _mm_load_ps(tmp_phi2_c_re+low_high_offset); - __m128 phi2_c_im = _mm_load_ps(tmp_phi2_c_im+low_high_offset); - - cfmadd_conj(operator_re, operator_im, phi1_re, phi1_im, &phi1_c_re, &phi1_c_im); - cfmadd_conj(operator_re, operator_im, phi2_re, phi2_im, &phi2_c_re, &phi2_c_im); - - _mm_store_ps(tmp_phi1_c_re+low_high_offset, phi1_c_re); - _mm_store_ps(tmp_phi1_c_im+low_high_offset, phi1_c_im); - _mm_store_ps(tmp_phi2_c_re+low_high_offset, phi2_c_re); - _mm_store_ps(tmp_phi2_c_im+low_high_offset, phi2_c_im); - // skip to next real line of matrix - operator += offset; - phi_pt++; - } - phi_pt += num_parent_eig_vect; - low_high_offset = offset; - } - } - - for ( int m=0; m= num_eig_vect ) break; - ((float*)(phi_c_pt+j+m))[0] = tmp_phi1_c_re[m]; - 
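/* The restriction loops above apply the adjoint of the interpolation
 * operator, so every accumulation is phi_c += conj(operator)*phi; that is
 * what cfmadd_conj provides. A minimal split-layout sketch, assuming the
 * first operand is the conjugated one (the library's own definition may
 * differ):
 */
static inline void cfmadd_conj_sketch( __m128 a_re, __m128 a_im, __m128 b_re, __m128 b_im,
                                       __m128 *c_re, __m128 *c_im ) {
  // c += conj(a)*b = (a_re - i a_im)(b_re + i b_im)
  *c_re = _mm_add_ps( *c_re, _mm_add_ps( _mm_mul_ps( a_re, b_re ), _mm_mul_ps( a_im, b_im ) ) );
  *c_im = _mm_add_ps( *c_im, _mm_sub_ps( _mm_mul_ps( a_re, b_im ), _mm_mul_ps( a_im, b_re ) ) );
}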
((float*)(phi_c_pt+j+m))[1] = tmp_phi1_c_im[m]; - } - for ( int m=0; m= num_eig_vect ) break; - ((float*)(phi_c_pt+num_eig_vect+j+m))[0] = tmp_phi2_c_re[m]; - ((float*)(phi_c_pt+num_eig_vect+j+m))[1] = tmp_phi2_c_im[m]; - } - for ( int m=0; m= num_eig_vect ) break; - ((float*)(phi_c_pt+2*num_eig_vect+j+m))[0] = tmp_phi1_c_re[m+offset]; - ((float*)(phi_c_pt+2*num_eig_vect+j+m))[1] = tmp_phi1_c_im[m+offset]; - } - for ( int m=0; m= num_eig_vect ) break; - ((float*)(phi_c_pt+3*num_eig_vect+j+m))[0] = tmp_phi2_c_re[m+offset]; - ((float*)(phi_c_pt+3*num_eig_vect+j+m))[1] = tmp_phi2_c_im[m+offset]; - } - } - } - else -#endif - for ( i=threading->n_thread*threading->core + threading->thread; in_core*threading->n_thread ) { - - int offset = SIMD_LENGTH_PRECISION; - // loop over blocks of SIMD_LENGTH_PRECISION vectors - for ( j=0; jvector_buffer + i*2*num_parent_eig_vect*aggregate_sites; - phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer.vector_buffer + i*2*num_eig_vect; - operator = l->is_PRECISION.operator + j*l->vector_size + i*2*offset*num_parent_eig_vect*aggregate_sites; - - // temporary, so we can used aligned load/store, and don't have to mess around with deinterleaving - // complex components and masking - // factor 2 is for low/high (refers to spin components being split up for preserving gamma5-symmetry of coarse operator) - float tmp_phi_c_re[2*offset]; - float tmp_phi_c_im[2*offset]; - __m128 zero = _mm_setzero_ps(); - for ( k1=0; k1<2*offset; k1+=offset ) { - _mm_store_ps(tmp_phi_c_re+k1, zero); - _mm_store_ps(tmp_phi_c_im+k1, zero); - } - - for ( k=0; k broadcast - __m128 phi_re = _mm_set1_ps(((float *)phi_pt)[0]); - __m128 phi_im = _mm_set1_ps(((float *)phi_pt)[1]); - - __m128 operator_re = _mm_load_ps((float *)operator); - __m128 operator_im = _mm_load_ps((float *)operator+offset); - __m128 phi_c_re = _mm_load_ps(tmp_phi_c_re+low_high_offset); - __m128 phi_c_im = _mm_load_ps(tmp_phi_c_im+low_high_offset); - - cfmadd_conj(operator_re, operator_im, phi_re, phi_im, &phi_c_re, &phi_c_im); - - _mm_store_ps(tmp_phi_c_re+low_high_offset, phi_c_re); - _mm_store_ps(tmp_phi_c_im+low_high_offset, phi_c_im); - // skip to next real line of matrix - operator += offset; - phi_pt++; - } - low_high_offset = offset; - } - } - - for ( int m=0; m= num_eig_vect ) break; - ((float*)(phi_c_pt+j+m))[0] = tmp_phi_c_re[m]; - ((float*)(phi_c_pt+j+m))[1] = tmp_phi_c_im[m]; - } - - for ( int m=0; m= num_eig_vect ) break; - ((float*)(phi_c_pt+num_eig_vect+j+m))[0] = tmp_phi_c_re[m+offset]; - ((float*)(phi_c_pt+num_eig_vect+j+m))[1] = tmp_phi_c_im[m+offset]; - } - } - } - - SYNC_HYPERTHREADS(threading) - START_LOCKED_MASTER(threading) - vector_PRECISION_gather( phi_c, &(l->next_level->gs_PRECISION.transfer_buffer), l->next_level ); - END_LOCKED_MASTER(threading) - PROF_PRECISION_STOP( _PR, 1, threading ); -} - -#endif // defined( SSE ) && defined( INTERPOLATION_OPERATOR_LAYOUT_OPTIMIZED_PRECISION ) diff --git a/src/sse_interpolation_generic.h b/src/sse_interpolation_generic.h deleted file mode 100644 index 14eb693..0000000 --- a/src/sse_interpolation_generic.h +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder. - * - * This file is part of the DDalphaAMG solver library. 
- * - * The DDalphaAMG solver library is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * The DDalphaAMG solver library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * - * You should have received a copy of the GNU General Public License - * along with the DDalphaAMG solver library. If not, see http://www.gnu.org/licenses/. - * - */ - -#ifndef SSE_INTERPOLATION_PRECISION_HEADER - #define SSE_INTERPOLATION_PRECISION_HEADER - - #ifdef SSE - void interpolation_PRECISION_alloc( level_struct *l ); - void interpolation_PRECISION_free( level_struct *l ); - void interpolation_PRECISION_dummy_alloc( level_struct *l ); - void interpolation_PRECISION_dummy_free( level_struct *l ); - - void interpolate_PRECISION( vector_PRECISION *phi, vector_PRECISION *phi_c, level_struct *l, Thread *threading ); - void interpolate3_PRECISION( vector_PRECISION *phi, vector_PRECISION *phi_c, level_struct *l, Thread *threading ); - void restrict_PRECISION( vector_PRECISION *phi_c, vector_PRECISION *phi, level_struct *l, Thread *threading ); -#endif - -#endif \ No newline at end of file diff --git a/src/sse_linalg.c b/src/sse_linalg.c deleted file mode 100644 index 1d0fc2b..0000000 --- a/src/sse_linalg.c +++ /dev/null @@ -1,795 +0,0 @@ -/* - * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder. - * - * This file is part of the DDalphaAMG solver library. - * - * The DDalphaAMG solver library is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * The DDalphaAMG solver library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * - * You should have received a copy of the GNU General Public License - * along with the DDalphaAMG solver library. If not, see http://www.gnu.org/licenses/. 
- * - */ - -#include "main.h" - -#ifdef SSE - -#ifdef OPTIMIZED_LINALG_double -void vector_double_scale( vector_double *z, vector_double *x, complex_double alpha, int start, int end, level_struct *l ) { - - int thread = omp_get_thread_num(); - if(thread == 0 && start != end) - PROF_double_START( _LA6 ); - - __m128d alpha_re = _mm_set1_pd( creal_double(alpha) ); - __m128d alpha_im = _mm_set1_pd( cimag_double(alpha) ); - double *zd = (double*)(z->vector_buffer+start); - double *xd = (double*)(x->vector_buffer+start); - - for( int i=start; iinner_vector_size ); -} -#endif - -#ifdef OPTIMIZED_LINALG_float -void vector_float_scale( vector_float *z, vector_float *x, complex_float alpha, int start, int end, level_struct *l ) { - - int thread = omp_get_thread_num(); - if(thread == 0 && start != end) - PROF_float_START( _LA6 ); - - __m128 alpha_re = _mm_set1_ps( creal_float(alpha) ); - __m128 alpha_im = _mm_set1_ps( cimag_float(alpha) ); - float *zf = (float*)(z->vector_buffer+start); - float *xf = (float*)(x->vector_buffer+start); - - if ( l->depth == 0 ) { - for( int i=start; iinner_vector_size ); -} -#endif - -#ifdef OPTIMIZED_LINALG_float -void vector_float_saxpy( vector_float *z, vector_float *x, vector_float *y, complex_float alpha, int start, int end, level_struct *l ) { - - int thread = omp_get_thread_num(); - if (thread == 0 && start != end ) - PROF_float_START( _LA8 ); - - __m128 alpha_re = _mm_set1_ps( creal_float(alpha) ); - __m128 alpha_im = _mm_set1_ps( cimag_float(alpha) ); - - if ( l->depth == 0 ) { - for ( int i=start; ivector_buffer+i), &x_re, &x_im ); - sse_complex_deinterleaved_load( (float*)(y->vector_buffer+i), &y_re, &y_im ); - cfmadd(alpha_re, alpha_im, y_re, y_im, &x_re, &x_im); - sse_complex_interleaved_store( x_re, x_im, (float*)(z->vector_buffer+i) ); - i+=SIMD_LENGTH_float; - } - ) - } - } else { - for ( int i=start; ivector_buffer+i), &x_re, &x_im ); - sse_complex_deinterleaved_load( (float*)(y->vector_buffer+i), &y_re, &y_im ); - cfmadd(alpha_re, alpha_im, y_re, y_im, &x_re, &x_im); - sse_complex_interleaved_store( x_re, x_im, (float*)(z->vector_buffer+i) ); - i+=SIMD_LENGTH_float; - } - } - - if( thread == 0 && start != end ) - PROF_float_STOP( _LA8, (double)(end-start)/(double)l->inner_vector_size ); -} -#endif - -#ifdef OPTIMIZED_LINALG_double -void vector_double_saxpy( vector_double *z, vector_double *x, vector_double *y, complex_double alpha, int start, int end, level_struct *l ) { - - int thread = omp_get_thread_num(); - if (thread == 0 && start != end ) - PROF_double_START( _LA8 ); - - __m128d alpha_re = _mm_set1_pd( creal_double(alpha) ); - __m128d alpha_im = _mm_set1_pd( cimag_double(alpha) ); - - for ( int i=start; ivector_buffer+i), &x_re, &x_im ); - sse_complex_deinterleaved_load_pd( (double*)(y->vector_buffer+i), &y_re, &y_im ); - cfmadd_pd(alpha_re, alpha_im, y_re, y_im, &x_re, &x_im); - sse_complex_interleaved_store_pd( x_re, x_im, (double*)(z->vector_buffer+i) ); - i+=SIMD_LENGTH_double; - } - ) - } - - if( thread == 0 && start != end ) - PROF_double_STOP( _LA8, (double)(end-start)/(double)l->inner_vector_size ); -} -#endif - -#ifdef OPTIMIZED_LINALG_double -complex_double global_inner_product_double( vector_double *phi, vector_double *psi, int start, int end, level_struct *l, struct Thread *threading ) { - - PROF_double_START( _GIP, threading ); - complex_double local_alpha = 0, global_alpha = 0; - - int thread_start; - int thread_end; - compute_core_start_end(start, end, &thread_start, &thread_end, l, threading); - - SYNC_CORES(threading) - - __m128d 
alpha_re = _mm_setzero_pd(); - __m128d alpha_im = _mm_setzero_pd(); - - if ( l->depth == 0 ) { - for( int i=thread_start; ivector_buffer+i), &phi_re, &phi_im ); - sse_complex_deinterleaved_load_pd( (double*)(psi->vector_buffer+i), &psi_re, &psi_im ); - cfmadd_conj_pd( phi_re, phi_im, psi_re, psi_im, &alpha_re, &alpha_im ); - i+=SIMD_LENGTH_double; - } - ) - } - } else { - for( int i=thread_start; ivector_buffer+i), &phi_re, &phi_im ); - sse_complex_deinterleaved_load_pd( (double*)(psi->vector_buffer+i), &psi_re, &psi_im ); - cfmadd_conj_pd( phi_re, phi_im, psi_re, psi_im, &alpha_re, &alpha_im ); - i+=SIMD_LENGTH_double; - } - } - - local_alpha = sse_reduce_add_pd( alpha_re ) + I* sse_reduce_add_pd( alpha_im ); - - // sum over cores - START_NO_HYPERTHREADS(threading) - ((complex_double *)threading->workspace)[threading->core] = local_alpha; - END_NO_HYPERTHREADS(threading) - // master sums up all results - SYNC_CORES(threading) - START_MASTER(threading) - for(int i=1; in_core; i++) - ((complex_double *)threading->workspace)[0] += ((complex_double *)threading->workspace)[i]; - local_alpha = ((complex_double *)threading->workspace)[0]; - END_MASTER(threading) - - if ( g.num_processes > 1 ) { - START_MASTER(threading) - PROF_double_START( _ALLR ); - MPI_Allreduce( &local_alpha, &global_alpha, 1, MPI_COMPLEX_double, MPI_SUM, (l->depth==0)?g.comm_cart:l->gs_double.level_comm ); - PROF_double_STOP( _ALLR, 1 ); - ((complex_double *)threading->workspace)[0] = global_alpha; - END_MASTER(threading) - // all threads need the result of the norm - SYNC_MASTER_TO_ALL(threading) - global_alpha = ((complex_double *)threading->workspace)[0]; - PROF_double_STOP( _GIP, (double)(end-start)/(double)l->inner_vector_size, threading ); - return global_alpha; - } else { - // all threads need the result of the norm - SYNC_MASTER_TO_ALL(threading) - local_alpha = ((complex_double *)threading->workspace)[0]; - PROF_double_STOP( _GIP, (double)(end-start)/(double)l->inner_vector_size, threading ); - return local_alpha; - } -} -#endif - -#ifdef OPTIMIZED_LINALG_float -complex_float global_inner_product_float( vector_float *phi, vector_float *psi, int start, int end, level_struct *l, struct Thread *threading ) { - - PROF_float_START( _GIP, threading ); - complex_float local_alpha = 0, global_alpha = 0; - - int thread_start; - int thread_end; - compute_core_start_end(start, end, &thread_start, &thread_end, l, threading); - - SYNC_CORES(threading) - - __m128 alpha_re = _mm_setzero_ps(); - __m128 alpha_im = _mm_setzero_ps(); - - float *phif = (float*)(phi->vector_buffer+thread_start); - float *psif = (float*)(psi->vector_buffer+thread_start); - - if ( l->depth == 0 ) { - for( int i=thread_start; iworkspace)[threading->core] = local_alpha; - END_NO_HYPERTHREADS(threading) - // master sums up all results - SYNC_CORES(threading) - START_MASTER(threading) - for(int i=1; in_core; i++) - ((complex_float *)threading->workspace)[0] += ((complex_float *)threading->workspace)[i]; - local_alpha = ((complex_float *)threading->workspace)[0]; - END_MASTER(threading) - - if ( g.num_processes > 1 ) { - START_MASTER(threading) - PROF_float_START( _ALLR ); - MPI_Allreduce( &local_alpha, &global_alpha, 1, MPI_COMPLEX_float, MPI_SUM, (l->depth==0)?g.comm_cart:l->gs_float.level_comm ); - PROF_float_STOP( _ALLR, 1 ); - ((complex_float *)threading->workspace)[0] = global_alpha; - END_MASTER(threading) - // all threads need the result of the norm - SYNC_MASTER_TO_ALL(threading) - global_alpha = ((complex_float *)threading->workspace)[0]; - 
PROF_float_STOP( _GIP, (double)(end-start)/(double)l->inner_vector_size, threading ); - return global_alpha; - } else { - // all threads need the result of the norm - SYNC_MASTER_TO_ALL(threading) - local_alpha = ((complex_float *)threading->workspace)[0]; - PROF_float_STOP( _GIP, (double)(end-start)/(double)l->inner_vector_size, threading ); - return local_alpha; - } -} -#endif - -#ifdef OPTIMIZED_LINALG_double -double global_norm_double( vector_double *x, int start, int end, level_struct *l, struct Thread *threading ) { - - PROF_double_START( _GIP, threading ); - - double local_alpha = 0, global_alpha = 0; - - int thread_start; - int thread_end; - compute_core_start_end(start, end, &thread_start, &thread_end, l, threading); - - SYNC_CORES(threading) - - VECTOR_FOR( int i=thread_start, ivector_buffer[i]), i++, l ); - - // sum over cores - START_NO_HYPERTHREADS(threading) - ((double *)threading->workspace)[threading->core] = local_alpha; - END_NO_HYPERTHREADS(threading) - // master sums up all results - SYNC_CORES(threading) - START_MASTER(threading) - for(int i=1; in_core; i++) - ((double *)threading->workspace)[0] += ((double *)threading->workspace)[i]; - local_alpha = ((double *)threading->workspace)[0]; - END_MASTER(threading) - - if ( g.num_processes > 1 ) { - START_MASTER(threading) - PROF_double_START( _ALLR ); - MPI_Allreduce( &local_alpha, &global_alpha, 1, MPI_double, MPI_SUM, (l->depth==0)?g.comm_cart:l->gs_double.level_comm ); - PROF_double_STOP( _ALLR, 1 ); - ((double *)threading->workspace)[0] = global_alpha; - END_MASTER(threading) - // all threads need the result of the norm - SYNC_MASTER_TO_ALL(threading) - global_alpha = ((double *)threading->workspace)[0]; - PROF_double_STOP( _GIP, (double)(end-start)/(double)l->inner_vector_size, threading ); - return (double)sqrt((double)global_alpha); - } else { - // all threads need the result of the norm - SYNC_MASTER_TO_ALL(threading) - local_alpha = ((double *)threading->workspace)[0]; - PROF_double_STOP( _GIP, (double)(end-start)/(double)l->inner_vector_size, threading ); - return (double)sqrt((double)local_alpha); - } -} -#endif - -#ifdef OPTIMIZED_LINALG_float -float global_norm_float( vector_float *x, int start, int end, level_struct *l, struct Thread *threading ) { - - PROF_float_START( _GIP, threading ); - - float local_alpha = 0, global_alpha = 0; - - int thread_start; - int thread_end; - compute_core_start_end(start, end, &thread_start, &thread_end, l, threading); - - SYNC_CORES(threading) - - __m128 alpha = _mm_setzero_ps(); - - if ( l->depth == 0 ) { - for( int i=thread_start; ivector_buffer+i)); - alpha = sse_fmadd( phi, phi, alpha ); - i += 2; - } - ) - } - } else { - for( int i=thread_start; ivector_buffer+i)); - alpha = sse_fmadd( phi, phi, alpha ); - i += 2; - } - } - - local_alpha = sse_reduce_add_ps( alpha ); - - // sum over cores - START_NO_HYPERTHREADS(threading) - ((float *)threading->workspace)[threading->core] = local_alpha; - END_NO_HYPERTHREADS(threading) - // master sums up all results - SYNC_CORES(threading) - START_MASTER(threading) - for(int i=1; in_core; i++) - ((float *)threading->workspace)[0] += ((float *)threading->workspace)[i]; - local_alpha = ((float *)threading->workspace)[0]; - END_MASTER(threading) - - if ( g.num_processes > 1 ) { - START_MASTER(threading) - PROF_float_START( _ALLR ); - MPI_Allreduce( &local_alpha, &global_alpha, 1, MPI_float, MPI_SUM, (l->depth==0)?g.comm_cart:l->gs_float.level_comm ); - PROF_float_STOP( _ALLR, 1 ); - ((float *)threading->workspace)[0] = global_alpha; - 
END_MASTER(threading) - // all threads need the result of the norm - SYNC_MASTER_TO_ALL(threading) - global_alpha = ((float *)threading->workspace)[0]; - PROF_float_STOP( _GIP, (double)(end-start)/(double)l->inner_vector_size, threading ); - return (float)sqrt((double)global_alpha); - } else { - // all threads need the result of the norm - SYNC_MASTER_TO_ALL(threading) - local_alpha = ((float *)threading->workspace)[0]; - PROF_float_STOP( _GIP, (double)(end-start)/(double)l->inner_vector_size, threading ); - return (float)sqrt((double)local_alpha); - } -} -#endif - -#ifdef OPTIMIZED_LINALG_double -void vector_double_multi_saxpy( vector_double *z, vector_double *V, complex_double *alpha, - int sign, int count, int start, int end, level_struct *l ) { - - int thread = omp_get_thread_num(); - if (thread == 0 && start != end ) - PROF_double_START( _LA8 ); - - int flag = 0; - __m128d alpha_re[count]; __m128d alpha_im[count]; - for ( int c=0; c EPS_double || -cimag_double(alpha[c]) > EPS_double ) - flag = 1; - } - - if ( flag == 0 ) { - for ( int c=0; cvector_buffer+i) ); - __m128d V_re = _mm_loadu_pd( (double*)(V[c].vector_buffer+i) ); - z_re = sse_fmadd_pd( alpha_re[c], V_re, z_re ); - _mm_storeu_pd( (double*)(z->vector_buffer+i), z_re ); - i++; - } - ) - } - } - } else { - for ( int c=0; cvector_buffer+i), &z_re, &z_im ); - sse_complex_deinterleaved_load_pd( (double*)(V[c].vector_buffer+i), &V_re, &V_im ); - cfmadd_pd(alpha_re[c], alpha_im[c], V_re, V_im, &z_re, &z_im); - sse_complex_interleaved_store_pd( z_re, z_im, (double*)(z->vector_buffer+i) ); - i += SIMD_LENGTH_double; - } - ) - } - } - } - - if( thread == 0 && start != end ) - PROF_double_STOP( _LA8, (double)(count) ); -} -#endif - -#ifdef OPTIMIZED_LINALG_float -void vector_float_multi_saxpy( vector_float *z, vector_float *V, complex_float *alpha, - int sign, int count, int start, int end, level_struct *l ) { - - __m128 V_re; __m128 V_im; - __m128 z_re; __m128 z_im; - __m128 alpha_re[count]; __m128 alpha_im[count]; - int flag = 0; - - int thread = omp_get_thread_num(); - if (thread == 0 && start != end ) - PROF_float_START( _LA8 ); - - for ( int c=0; c EPS_float || -cimag_float(alpha[c]) > EPS_float ) - flag = 1; - } - - if ( l->depth == 0 ) { - if ( flag == 0 ) { - for ( int c=0; cvector_buffer+i) ); - V_re = _mm_loadu_ps( (float*)(V[c].vector_buffer+i) ); - z_re = sse_fmadd( alpha_re[c], V_re, z_re ); - _mm_storeu_ps( (float*)(z->vector_buffer+i), z_re ); - i+=2; - } - ) - } - } - } else { - for ( int c=0; cvector_buffer+i), &z_re, &z_im ); - sse_complex_deinterleaved_load( (float*)(V[c].vector_buffer+i), &V_re, &V_im ); - cfmadd(alpha_re[c], alpha_im[c], V_re, V_im, &z_re, &z_im); - sse_complex_interleaved_store( z_re, z_im, (float*)(z->vector_buffer+i) ); - i+=SIMD_LENGTH_float; - } - ) - } - } - } - } else { - if ( flag == 0 ) { - for ( int c=0; cvector_buffer+i) ); - V_re = _mm_loadu_ps( (float*)(V[c].vector_buffer+i) ); - z_re = sse_fmadd( alpha_re[c], V_re, z_re ); - _mm_storeu_ps( (float*)(z->vector_buffer+i), z_re ); - i+=2; - } - } - } else { - for ( int c=0; cvector_buffer+i), &z_re, &z_im ); - sse_complex_deinterleaved_load( (float*)(V[c].vector_buffer+i), &V_re, &V_im ); - cfmadd(alpha_re[c], alpha_im[c], V_re, V_im, &z_re, &z_im); - sse_complex_interleaved_store( z_re, z_im, (float*)(z->vector_buffer+i) ); - i+=SIMD_LENGTH_float; - } - } - } - } - - if( thread == 0 && start != end ) - PROF_float_STOP( _LA8, (double)(count) ); -} -#endif - -#ifdef OPTIMIZED_LINALG_float -void process_multi_inner_product_MP( int count, 
complex_double *results, vector_float *phi, - vector_float *psi, int start, int end, level_struct *l, - struct Thread *threading ) { - - PROF_float_START( _PIP, threading ); - int i; - for(int c=0; cvector_buffer+i), &psi_re, &psi_im ); - sse_complex_deinterleaved_load( (float*)(phi[c].vector_buffer+i), &phi_re, &phi_im ); - - cmul_conj(phi_re, phi_im, psi_re, psi_im, &result_re, &result_im); - - sse_complex_deinterleaved_load( (float*)(psi->vector_buffer+i+4), &psi_re, &psi_im ); - sse_complex_deinterleaved_load( (float*)(phi[c].vector_buffer+i+4), &phi_re, &phi_im ); - - cfmadd_conj(phi_re, phi_im, psi_re, psi_im, &result_re, &result_im); - - sse_complex_deinterleaved_load( (float*)(psi->vector_buffer+i+8), &psi_re, &psi_im ); - sse_complex_deinterleaved_load( (float*)(phi[c].vector_buffer+i+8), &phi_re, &phi_im ); - - cfmadd_conj(phi_re, phi_im, psi_re, psi_im, &result_re, &result_im); - - results[c] += sse_reduce_add_ps(result_re) + I* sse_reduce_add_ps(result_im); - } - } - - START_NO_HYPERTHREADS(threading) - ((complex_double **)threading->workspace)[threading->core] = results; - END_NO_HYPERTHREADS(threading) - // master sums up all results - SYNC_CORES(threading) - START_MASTER(threading) - for(int c=0; cn_core; i++) - ((complex_double **)threading->workspace)[0][c] += ((complex_double **)threading->workspace)[i][c]; - END_MASTER(threading) - // all threads need the result of the norm - SYNC_MASTER_TO_ALL(threading) - for(int c=0; cworkspace)[0][c]; - - PROF_float_STOP( _PIP, (double)(end-start)/(double)l->inner_vector_size, threading ); -} -#endif - -#ifdef OPTIMIZED_LINALG_float -void process_multi_inner_product_float( int count, complex_float *results, vector_float *phi, vector_float *psi, - int start, int end, level_struct *l, struct Thread *threading ) { - - PROF_float_START( _PIP, threading ); - int i; - for(int c=0; cdepth == 0 ) { - compute_core_start_end_custom(start, end, &thread_start, &thread_end, l, threading, 12); - for(int c=0; cvector_buffer+i), &psi_re, &psi_im ); - - cfmadd_conj(phi_re, phi_im, psi_re, psi_im, &result_re, &result_im); - i+=SIMD_LENGTH_float; - } - ) - } - results[c] += sse_reduce_add_ps(result_re) + I*sse_reduce_add_ps(result_im); - } - } else { - compute_core_start_end_custom(start, end, &thread_start, &thread_end, l, threading, 4); - for(int c=0; cvector_buffer+i), &psi_re, &psi_im ); - - cfmadd_conj(phi_re, phi_im, psi_re, psi_im, &result_re, &result_im); - } - results[c] += sse_reduce_add_ps(result_re) + I*sse_reduce_add_ps(result_im); - } - } - - START_NO_HYPERTHREADS(threading) - ((complex_float **)threading->workspace)[threading->core] = results; - END_NO_HYPERTHREADS(threading) - // master sums up all results - SYNC_CORES(threading) - START_MASTER(threading) - for(int c=0; cn_core; i++) - ((complex_float **)threading->workspace)[0][c] += ((complex_float **)threading->workspace)[i][c]; - END_MASTER(threading) - // all threads need the result of the norm - SYNC_MASTER_TO_ALL(threading) - for(int c=0; cworkspace)[0][c]; - - PROF_float_STOP( _PIP, (double)(count*(end-start))/(double)l->inner_vector_size, threading ); -} -#endif - -#ifdef OPTIMIZED_LINALG_double -void process_multi_inner_product_double( int count, complex_double *results, vector_double *phi, vector_double *psi, - int start, int end, level_struct *l, struct Thread *threading ) { - - PROF_double_START( _PIP, threading ); - int i; - for(int c=0; cdepth == 0 ) { - compute_core_start_end_custom(start, end, &thread_start, &thread_end, l, threading, 12); - for(int c=0; 
cvector_buffer+i), &pdi_re, &pdi_im ); - - cfmadd_conj_pd(phi_re, phi_im, pdi_re, pdi_im, &result_re, &result_im); - i+=SIMD_LENGTH_double; - } - ) - } - results[c] += sse_reduce_add_pd(result_re) + I*sse_reduce_add_pd(result_im); - } - } else { - compute_core_start_end_custom(start, end, &thread_start, &thread_end, l, threading, 2); - for(int c=0; cvector_buffer+i), &pdi_re, &pdi_im ); - - cfmadd_conj_pd(phi_re, phi_im, pdi_re, pdi_im, &result_re, &result_im); - } - results[c] += sse_reduce_add_pd(result_re) + I*sse_reduce_add_pd(result_im); - } - } - - START_NO_HYPERTHREADS(threading) - ((complex_double **)threading->workspace)[threading->core] = results; - END_NO_HYPERTHREADS(threading) - // master sums up all results - SYNC_CORES(threading) - START_MASTER(threading) - for(int c=0; cn_core; i++) - ((complex_double **)threading->workspace)[0][c] += ((complex_double **)threading->workspace)[i][c]; - END_MASTER(threading) - // all threads need the result of the norm - SYNC_MASTER_TO_ALL(threading) - for(int c=0; cworkspace)[0][c]; - - PROF_double_STOP( _PIP, (double)(count*(end-start))/(double)l->inner_vector_size, threading ); -} -#endif - -#endif // SSE - diff --git a/src/sse_linalg.h b/src/sse_linalg.h deleted file mode 100644 index cd88fad..0000000 --- a/src/sse_linalg.h +++ /dev/null @@ -1,497 +0,0 @@ -/* - * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder. - * - * This file is part of the DDalphaAMG solver library. - * - * The DDalphaAMG solver library is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * The DDalphaAMG solver library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * - * You should have received a copy of the GNU General Public License - * along with the DDalphaAMG solver library. If not, see http://www.gnu.org/licenses/. 
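The norm and inner-product kernels above all share one three-stage reduction: a SIMD partial sum per thread, a core-level sum through threading->workspace guarded by the SYNC/MASTER macros, and a single MPI_Allreduce over the level communicator. Stripped of the barrier macros, the pattern is the following sketch, with an OpenMP reduction standing in for the hand-rolled workspace sum:

#include <math.h>
#include <mpi.h>

/* sketch of the global-norm reduction: thread-local partial sums,
   then one Allreduce across ranks                                  */
double sketch_global_norm( const double *x, int n, MPI_Comm comm ) {
  double local = 0.0, global = 0.0;
  #pragma omp parallel for reduction(+:local)
  for ( int i = 0; i < n; i++ )
    local += x[i] * x[i];
  MPI_Allreduce( &local, &global, 1, MPI_DOUBLE, MPI_SUM, comm );
  return sqrt( global );
}

Performing only one Allreduce per norm (rather than one per thread) is the point of funneling all partial results through the master thread first.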
- * - */ - -#ifndef LINALG_SSE_H -#define LINALG_SSE_H -#ifdef SSE - - -// Standard Gram-Schmidt on aggregates -static inline void sse_aggregate_gram_schmidt_float( complex_float *V, const int num_vec, - level_struct *l, struct Thread *threading ); -// Gram-Schmidt on a block of vectors, used by Block-Gram-Schmidt -static inline void sse_aggregate_gram_schmidt_block_float( float *V, - int num_vec, int leading_dimension, level_struct *l, struct Thread *threading ); -// used by Block-Gram-Schmidt -static inline void sse_aggregate_block_dot_block_float( float *S, float *U, float *B, - int num_vec, int leading_dimension, level_struct *l, struct Thread *threading ); -// used by Block-Gram-Schmidt -static inline void sse_aggregate_block_minus_block_times_dot_float( float *B, float *U, float *S, - int num_vec, int leading_dimension, level_struct *l, struct Thread *threading ); - -static inline void sse_aggregate_gram_schmidt_double( complex_double *V, const int num_vec, - level_struct *l, struct Thread *threading ) {} -static inline void sse_aggregate_gram_schmidt_block_double( double *V, - int num_vec, int leading_dimension, level_struct *l, struct Thread *threading ) {} -static inline void sse_aggregate_block_dot_block_double( double *S, double *U, double *B, - int num_vec, int leading_dimension, level_struct *l, struct Thread *threading ) {} -static inline void sse_aggregate_block_minus_block_times_dot_double( double *B, double *U, double *S, - int num_vec, int leading_dimension, level_struct *l, struct Thread *threading ) {} - - -static inline void sse_aggregate_gram_schmidt_float( complex_float *V, const int num_vec, level_struct *l, struct Thread *threading ) { - - PROF_float_START( _GRAM_SCHMIDT_ON_AGGREGATES, threading ); - SYNC_CORES(threading) - SYNC_HYPERTHREADS(threading) - long int i, j, k, k1, k2, k3, num_aggregates = l->s_float.num_aggregates, - aggregate_size = l->inner_vector_size / num_aggregates, offset = l->num_lattice_site_var/2; - - float *v_pt1; - float *v_pt2; - float norm1, norm2; - float next_norm1; - float next_norm2; - int ldv = SIMD_LENGTH_float; - int V_block_offset = 2*l->vector_size; - - for ( j=threading->n_thread*threading->core+threading->thread; jn_thread*threading->n_core ) { - - v_pt1 = (float *)V + 0 + j*aggregate_size*2*ldv; - - next_norm1 = 0.0; - next_norm2 = 0.0; - for ( i=0; is_float.num_aggregates, - aggregate_size = l->inner_vector_size / num_aggregates, offset = l->num_lattice_site_var/2; - - float *v_pt1; - float *v_pt2; - float norm; - float next_norm; - int ldv = leading_dimension; - //offset = 6; - - - // current thread chooses an aggregate - for ( int jp=threading->core; jp<2*num_aggregates; jp+=threading->n_core ) { - j = jp/2; - int component = jp%2; - - - v_pt1 = V + 2*component*offset*ldv + j*aggregate_size*2*ldv; - - next_norm = 0.0; - - // for the whole aggregate - for ( i=0; is_float.num_aggregates; - int aggregate_size = l->inner_vector_size / num_aggregates; - int offset = l->num_lattice_site_var/2; - - for ( int jp=threading->core; jpn_core ) { - int j = jp/2; - int component = jp%2; - // factors 2 are for complex and spin01/23 aggregates - Up = U + 2*component*offset*leading_dimension + 2*2*j*aggregate_size*leading_dimension; - Bp = B + 2*component*offset*leading_dimension + 2*2*j*aggregate_size*leading_dimension; - __m128 U_re; - __m128 U_im; - __m128 B_re; - __m128 B_im; - __m128 S_re[SIMD_LENGTH_float]; - __m128 S_im[SIMD_LENGTH_float]; - for( int i=0; is_float.num_aggregates; - int aggregate_size = l->inner_vector_size / 
num_aggregates; - int offset = l->num_lattice_site_var/2; - - for ( int jp=threading->core; jpn_core ) { - int j = jp/2; - int component = jp%2; - // factors 2 are for complex and spin01/23 aggregates - Up = U + 2*component*offset*leading_dimension + 2*2*j*aggregate_size*leading_dimension; - Bp = B + 2*component*offset*leading_dimension + 2*2*j*aggregate_size*leading_dimension; - __m128 U_re; - __m128 U_im; - __m128 B_re; - __m128 B_im; - __m128 S_re[SIMD_LENGTH_float]; - __m128 S_im[SIMD_LENGTH_float]; - for( int i=0; ivector_size), - (PRECISION *)(V + j*l->vector_size), vecs, l, threading ); - aggregate_gram_schmidt_block_PRECISION( (PRECISION *)(V + i*l->vector_size), vecs, SIMD_LENGTH_PRECISION, l, threading ); - } - SYNC_CORES(threading) - PROF_PRECISION_STOP( _GRAM_SCHMIDT_ON_AGGREGATES, 1, threading ); -} - - -void gram_schmidt_on_aggregates_PRECISION_vectorized( complex_PRECISION *V, const int num_vec, level_struct *l, struct Thread *threading ) { - - // the block version has some optimizations which are correct only on the fine grid - if(l->depth == 0) - aggregate_block_gram_schmidt_PRECISION_vectorized(V, num_vec, l, threading); - else - aggregate_gram_schmidt_PRECISION_vectorized(V, num_vec, l, threading); -} - - -void aggregate_orthogonalize_block_wrt_orthonormal_block_PRECISION( PRECISION *B, PRECISION *U, int num_vec, level_struct *l, struct Thread *threading ) { - START_NO_HYPERTHREADS(threading) - - PRECISION *S = NULL; - START_LOCKED_MASTER(threading) - // factors 2 are for complex and spin01/23 aggregates - MALLOC_HUGEPAGES(S, PRECISION, 2*2*l->s_PRECISION.num_aggregates*SIMD_LENGTH_PRECISION*SIMD_LENGTH_PRECISION, 64); - ((PRECISION **)threading->workspace)[0] = S; - END_LOCKED_MASTER(threading) - S = ((PRECISION **)threading->workspace)[0]; - - aggregate_block_dot_block_PRECISION(S, U, B, num_vec, SIMD_LENGTH_PRECISION, l , threading); - aggregate_block_minus_block_times_dot_PRECISION(B, U, S, num_vec, SIMD_LENGTH_PRECISION, l , threading); - - START_LOCKED_MASTER(threading) - FREE_HUGEPAGES(S, PRECISION, 2*2*l->s_PRECISION.num_aggregates*SIMD_LENGTH_PRECISION*SIMD_LENGTH_PRECISION); - END_LOCKED_MASTER(threading) - - END_NO_HYPERTHREADS(threading) -} - - -void aggregate_block_dot_block_PRECISION( PRECISION *S, PRECISION *U, PRECISION *B, - int num_vec, int leading_dimension, level_struct *l, struct Thread *threading ) { - sse_aggregate_block_dot_block_PRECISION( S, U, B, num_vec, leading_dimension, l, threading ); -} - - -void aggregate_block_minus_block_times_dot_PRECISION( PRECISION *B, PRECISION *U, PRECISION *S, - int num_vec, int leading_dimension, level_struct *l, struct Thread *threading ) { - sse_aggregate_block_minus_block_times_dot_PRECISION( B, U, S, num_vec, leading_dimension, l, threading ); -} - -#ifdef GRAM_SCHMIDT_VECTORIZED_PRECISION -void setup_gram_schmidt_PRECISION_compute_dots( - complex_PRECISION *thread_buffer, vector_PRECISION *V, int count, int offset, - int start, int end, level_struct *l, struct Thread *threading) { - - int thread_start; - int thread_end; - int cache_block_size = 12*16; - - for(int i=0; i<2*offset; i++) - thread_buffer[i] = 0.0; - - SYNC_CORES(threading) - compute_core_start_end_custom(start, end, &thread_start, &thread_end, l, threading, cache_block_size); - - __m128 dot_re[count]; - __m128 dot_im[count]; - __m128 dot_gamma5_re[count]; - __m128 dot_gamma5_im[count]; - - for ( int j=0; j can use 3 pre-defined +/-1 patterns - __m128 gamma5[3]; - gamma5[0] = _mm_set_ps( -1.0,-1.0,-1.0,-1.0 ); - gamma5[1] = _mm_set_ps( 1.0, 
1.0,-1.0,-1.0 ); - gamma5[2] = _mm_set_ps( 1.0, 1.0, 1.0, 1.0 ); - - for(int m=0; m<3; m++) { - - sse_complex_deinterleaved_load( (float*)(V[j].vector_buffer+i+k+4*m), &vj_re, &vj_im ); - sse_complex_deinterleaved_load( (float*)(V[count].vector_buffer+i+k+4*m), &v_re, &v_im ); - - gamma5_v_re = _mm_mul_ps(gamma5[m], v_re); - gamma5_v_im = _mm_mul_ps(gamma5[m], v_im); - - cfmadd_conj(vj_re, vj_im, v_re, v_im, dot_re+j, dot_im+j); - cfmadd_conj(vj_re, vj_im, gamma5_v_re, gamma5_v_im, dot_gamma5_re+j, dot_gamma5_im+j); - } - } - } - } - for ( int j=0; jworkspace)[threading->core] = thread_buffer; - END_NO_HYPERTHREADS(threading) - // master sums up all results - SYNC_CORES(threading) - START_MASTER(threading) - for(int i=1; in_core; i++) { - for(int j=0; jworkspace)[0][j] += ((complex_PRECISION **)threading->workspace)[i][j]; - ((complex_PRECISION **)threading->workspace)[0][j+offset] += ((complex_PRECISION **)threading->workspace)[i][j+offset]; - } - } - END_MASTER(threading) - // only master needs the result in this case (it will be distributed later) -} -#endif - -#ifdef GRAM_SCHMIDT_VECTORIZED_PRECISION -void setup_gram_schmidt_PRECISION_axpys( - complex_PRECISION *thread_buffer, vector_PRECISION *V, int count, int offset, - int start, int end, level_struct *l, struct Thread *threading) { - - int thread_start; - int thread_end; - int cache_block_size = 12*16; - - compute_core_start_end_custom(start, end, &thread_start, &thread_end, l, threading, cache_block_size); - - __m128 dot_re[count]; - __m128 dot_im[count]; - __m128 dot_gamma5_re[count]; - __m128 dot_gamma5_im[count]; - - for ( int j=0; j can use 3 pre-defined +/-1 patterns - __m128 gamma5[3]; - gamma5[0] = _mm_set_ps( -1.0,-1.0,-1.0,-1.0 ); - gamma5[1] = _mm_set_ps( 1.0, 1.0,-1.0,-1.0 ); - gamma5[2] = _mm_set_ps( 1.0, 1.0, 1.0, 1.0 ); - - for(int m=0; m<3; m++) { - - sse_complex_deinterleaved_load( (float*)(V[j].vector_buffer+i+k+4*m), &vj_re, &vj_im ); - sse_complex_deinterleaved_load( (float*)(V[count].vector_buffer+i+k+4*m), &v_re, &v_im ); - - gamma5_vj_re = _mm_mul_ps(gamma5[m], vj_re); - gamma5_vj_im = _mm_mul_ps(gamma5[m], vj_im); - - cfnmadd(vj_re, vj_im, dot_re[j], dot_im[j], &v_re, &v_im); - cfnmadd(gamma5_vj_re, gamma5_vj_im, dot_gamma5_re[j], dot_gamma5_im[j], &v_re, &v_im); - - sse_complex_interleaved_store(v_re, v_im, (float*)(V[count].vector_buffer+i+k+4*m) ); - } - } - } - } -} -#endif - -#endif diff --git a/src/sse_linalg_generic.h b/src/sse_linalg_generic.h deleted file mode 100644 index e29733a..0000000 --- a/src/sse_linalg_generic.h +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder. - * - * This file is part of the DDalphaAMG solver library. - * - * The DDalphaAMG solver library is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * The DDalphaAMG solver library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * - * You should have received a copy of the GNU General Public License - * along with the DDalphaAMG solver library. If not, see http://www.gnu.org/licenses/. 
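The three _mm_set_ps sign patterns above apply gamma5 to a 12-component site (3 colors x 4 spins) four lanes at a time; assuming the spin-major component ordering used here, the first six components (spins 0 and 1) pick up a factor -1 and the last six a factor +1, so both <v_j, v> and <v_j, gamma5 v> fall out of a single sweep. A scalar sketch of the same sign application, under that ordering assumption:

/* apply gamma5 to one 12-component site, deinterleaved re/im arrays:
   components 0..5 (spins 0,1) flip sign, 6..11 (spins 2,3) keep it  */
static void apply_gamma5_site( float *v_re, float *v_im ) {
  for ( int k = 0; k < 6; k++ ) {
    v_re[k] = -v_re[k];
    v_im[k] = -v_im[k];
  }
}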
- * - */ - -#ifndef SSE_LINALG_PRECISION_HEADER - #define SSE_LINALG_PRECISION_HEADER - #ifdef SSE - - void gram_schmidt_on_aggregates_PRECISION_vectorized( complex_PRECISION *V, const int num_vec, level_struct *l, struct Thread *threading ); - // Block-Gram-Schmidt on aggregates - void aggregate_block_gram_schmidt_PRECISION_vectorized( complex_PRECISION *V, const int num_vec, level_struct *l, struct Thread *threading ); - // Standard Gram-Schmidt on aggregates - void aggregate_gram_schmidt_PRECISION_vectorized( complex_PRECISION *V, const int num_vec, level_struct *l, struct Thread *threading ); - - // Gram-Schmidt on a block of vectors, used by Block-Gram-Schmidt - void aggregate_gram_schmidt_block_PRECISION( PRECISION *V, - int num_vec, int leading_dimension, level_struct *l, struct Thread *threading ); - // used by Block-Gram-Schmidt - void aggregate_orthogonalize_block_wrt_orthonormal_block_PRECISION( PRECISION *B, PRECISION *U, - int num_vec, level_struct *l, struct Thread *threading ); - // used by Block-Gram-Schmidt - void aggregate_block_dot_block_PRECISION( PRECISION *S, PRECISION *U, PRECISION *B, - int num_vec, int leading_dimension, level_struct *l, struct Thread *threading ); - // used by Block-Gram-Schmidt - void aggregate_block_minus_block_times_dot_PRECISION( PRECISION *B, PRECISION *U, PRECISION *S, - int num_vec, int leading_dimension, level_struct *l, struct Thread *threading ); - - void setup_gram_schmidt_PRECISION_compute_dots( - complex_PRECISION *thread_buffer, vector_PRECISION *V, int count, int offset, - int start, int end, level_struct *l, struct Thread *threading); - - void setup_gram_schmidt_PRECISION_axpys( - complex_PRECISION *thread_buffer, vector_PRECISION *V, int count, int offset, - int start, int end, level_struct *l, struct Thread *threading); - -#endif -#endif diff --git a/src/vectorization_control.h b/src/vectorization_control.h deleted file mode 100644 index f05a701..0000000 --- a/src/vectorization_control.h +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder. - * - * This file is part of the DDalphaAMG solver library. - * - * The DDalphaAMG solver library is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * The DDalphaAMG solver library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * - * You should have received a copy of the GNU General Public License - * along with the DDalphaAMG solver library. If not, see http://www.gnu.org/licenses/. 
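The aggregate_block_dot_block / aggregate_block_minus_block_times_dot pair declared above implements one block Gram-Schmidt step, B <- B - U*(U^H B). A dense reference version under a plain row-major complex layout follows; the real kernels work aggregate-wise on deinterleaved re/im data with an explicit leading dimension, which this sketch deliberately omits.

#include <complex.h>

/* S = U^H * B for n x nv blocks U, B; S is nv x nv */
static void block_dot_block( float complex *S, const float complex *U,
                             const float complex *B, int n, int nv ) {
  for ( int a = 0; a < nv; a++ )
    for ( int b = 0; b < nv; b++ ) {
      float complex s = 0;
      for ( int i = 0; i < n; i++ )
        s += conjf( U[i*nv+a] ) * B[i*nv+b];
      S[a*nv+b] = s;
    }
}

/* B -= U * S, projecting B onto the orthogonal complement of span(U) */
static void block_minus_block_times_dot( float complex *B, const float complex *U,
                                         const float complex *S, int n, int nv ) {
  for ( int i = 0; i < n; i++ )
    for ( int b = 0; b < nv; b++ )
      for ( int a = 0; a < nv; a++ )
        B[i*nv+b] -= U[i*nv+a] * S[a*nv+b];
}

Doing the projection blockwise instead of vector-by-vector turns the inner products into small matrix products, which is what makes the SSE version above profitable.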
- * - */ - -#ifndef VECTORIZATION_CONTROL_H -#define VECTORIZATION_CONTROL_H - -#ifdef SSE - -#define SIMD_LENGTH_float 4 -#define SIMD_LENGTH_double 2 - -#define OPTIMIZED_COARSE_NEIGHBOR_COUPLING_float -#define OPTIMIZED_COARSE_SELF_COUPLING_float -#define INTERPOLATION_OPERATOR_LAYOUT_OPTIMIZED_float -#define INTERPOLATION_SETUP_LAYOUT_OPTIMIZED_float -#define OPTIMIZED_NEIGHBOR_COUPLING_double -#define OPTIMIZED_NEIGHBOR_COUPLING_float -#define OPTIMIZED_SELF_COUPLING_float -#define GRAM_SCHMIDT_VECTORIZED_float -#define OPTIMIZED_LINALG_float -#define OPTIMIZED_LINALG_double - -#include "sse_complex_float_intrinsic.h" -#include "sse_complex_double_intrinsic.h" - -#endif - -#define OPERATOR_COMPONENT_OFFSET_float (SIMD_LENGTH_float *((l->num_eig_vect+SIMD_LENGTH_float -1)/SIMD_LENGTH_float )) -#define OPERATOR_COMPONENT_OFFSET_double (SIMD_LENGTH_double*((l->num_eig_vect+SIMD_LENGTH_double-1)/SIMD_LENGTH_double)) - -#define OPERATOR_TYPE_float float -#define OPERATOR_TYPE_double double - -#endif // VECTORIZATION_CONTROL_H diff --git a/src/vectorization_dirac_generic.c b/src/vectorization_dirac_generic.c deleted file mode 100644 index 9ea2b3e..0000000 --- a/src/vectorization_dirac_generic.c +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder. - * - * This file is part of the DDalphaAMG solver library. - * - * The DDalphaAMG solver library is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * The DDalphaAMG solver library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * - * You should have received a copy of the GNU General Public License - * along with the DDalphaAMG solver library. If not, see http://www.gnu.org/licenses/. 
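The OPERATOR_COMPONENT_OFFSET_* macros above pad the number of test vectors up to the next multiple of the SIMD length, so every column block of the interpolation operator starts on an aligned boundary. The idiom is plain ceiling division:

/* round n up to the next multiple of simd, e.g. n=10, simd=4 -> 12;
   this is what SIMD_LENGTH*((n+SIMD_LENGTH-1)/SIMD_LENGTH) computes */
static inline int round_up_to_simd( int n, int simd ) {
  return simd * ( ( n + simd - 1 ) / simd );
}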
- * - */ - -#include "main.h" - -#ifdef SSE -void d_plus_clover_aggregate_PRECISION_vectorized( complex_PRECISION *eta1, complex_PRECISION *eta2, - complex_PRECISION *phi, schwarz_PRECISION_struct *s, level_struct *l, - int site, int *direction_flags ) { - - int offset = SIMD_LENGTH_PRECISION; - int site_offset = 12*offset; - int index_out; - int index_bw; - int index_fw; - int *neighbor = s->op.neighbor_table; - int *backward_neighbor = s->op.backward_neighbor_table; - complex_PRECISION *phi_pt; - complex_PRECISION buffer1[site_offset] __attribute__((aligned(64))); - complex_PRECISION buffer2[site_offset] __attribute__((aligned(64))); - config_PRECISION D_pt; - config_PRECISION D = s->op.D; - - // add clover term/shift - spin0and1_site_clover_PRECISION_vectorized( eta1, phi+site_offset*site, s->op.clover+42*site, 4+s->op.m0, offset ); - spin2and3_site_clover_PRECISION_vectorized( eta2, phi+site_offset*site, s->op.clover+42*site, 4+s->op.m0, offset ); - - index_out = site; - - for(int mu=0; mu<4; mu++) { - index_fw = neighbor[4*index_out + mu]; - index_bw = backward_neighbor[4*index_out + mu]; - - // from backward - if ( direction_flags[2*mu+0] == 1 ) { - D_pt = D + 36*index_bw+9*mu; - phi_pt = phi + site_offset*index_bw; - mvmh_PRECISION_vectorized( buffer2+0*offset, D_pt, phi_pt+0*offset, offset ); - mvmh_PRECISION_vectorized( buffer2+3*offset, D_pt, phi_pt+3*offset, offset ); - mvmh_PRECISION_vectorized( buffer2+6*offset, D_pt, phi_pt+6*offset, offset ); - mvmh_PRECISION_vectorized( buffer2+9*offset, D_pt, phi_pt+9*offset, offset ); - twospin_PRECISION_vectorized( eta1, eta2, buffer2, offset, mu, -1.0 ); - } - - // from forward - if ( direction_flags[2*mu+1] == 1 ) { - D_pt = D + 36*index_out+9*mu; - phi_pt = phi + site_offset*index_fw; - mvm_PRECISION_vectorized( buffer1+0*offset, D_pt, phi_pt+0*offset, offset ); - mvm_PRECISION_vectorized( buffer1+3*offset, D_pt, phi_pt+3*offset, offset ); - mvm_PRECISION_vectorized( buffer1+6*offset, D_pt, phi_pt+6*offset, offset ); - mvm_PRECISION_vectorized( buffer1+9*offset, D_pt, phi_pt+9*offset, offset ); - twospin_PRECISION_vectorized( eta1, eta2, buffer1, offset, mu, 1.0 ); - } - } -} -#endif - -#ifdef SSE -void d_neighbor_aggregate_PRECISION_vectorized( complex_PRECISION *eta1, complex_PRECISION *eta2, - complex_PRECISION *phi, const int mu, schwarz_PRECISION_struct *s, level_struct *l, - int site ) { - - int offset = SIMD_LENGTH_PRECISION; - int site_offset = 12*offset; - int index_out; - int index_fw; - int *neighbor = s->op.neighbor_table; - complex_PRECISION *phi_pt; - complex_PRECISION buffer[site_offset] __attribute__((aligned(64))); - config_PRECISION D_pt; - config_PRECISION D = s->op.D; - - index_out = site; - - // requires the positive boundaries of phi to be communicated befor - index_fw = neighbor[4*index_out + mu]; - D_pt = D + 36*index_out+9*mu; - phi_pt = phi + site_offset*index_fw; - mvm_PRECISION_vectorized_simd_length( buffer+0*offset, D_pt, phi_pt+0*offset ); - mvm_PRECISION_vectorized_simd_length( buffer+3*offset, D_pt, phi_pt+3*offset ); - mvm_PRECISION_vectorized_simd_length( buffer+6*offset, D_pt, phi_pt+6*offset ); - mvm_PRECISION_vectorized_simd_length( buffer+9*offset, D_pt, phi_pt+9*offset ); - twospin2_p_PRECISION_vectorized_simd_length( eta1, eta2, buffer, mu ); -} -#endif - -#ifdef SSE -void diagonal_aggregate_PRECISION_vectorized( complex_PRECISION *eta1, complex_PRECISION *eta2, - complex_PRECISION *phi, schwarz_PRECISION_struct *s, level_struct *l, - int site ) { - - int offset = SIMD_LENGTH_PRECISION; - int 
site_offset = 12*offset;
-
-  sse_diagonal_aggregate_PRECISION( eta1, eta2, phi+site_offset*site, s->op.odd_proj+12*site, offset );
-}
-#endif
diff --git a/src/vectorization_dirac_generic.h b/src/vectorization_dirac_generic.h
deleted file mode 100644
index 5b8f02c..0000000
--- a/src/vectorization_dirac_generic.h
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder.
- *
- * This file is part of the DDalphaAMG solver library.
- *
- * The DDalphaAMG solver library is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * The DDalphaAMG solver library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- *
- * You should have received a copy of the GNU General Public License
- * along with the DDalphaAMG solver library. If not, see http://www.gnu.org/licenses/.
- *
- */
-
-#ifndef VECTORIZATION_DIRAC_PRECISION_HEADER
-  #define VECTORIZATION_DIRAC_PRECISION_HEADER
-
-#ifdef SSE
-  #include "sse_dirac.h"
-#endif
-
-  // caller is responsible for checking that it needs coupling in this direction for this site
-  void d_neighbor_aggregate_PRECISION_vectorized( complex_PRECISION *eta1, complex_PRECISION *eta2,
-      complex_PRECISION *phi, const int mu, schwarz_PRECISION_struct *s, level_struct *l,
-      int site );
-
-  void d_plus_clover_aggregate_PRECISION_vectorized( complex_PRECISION *eta1, complex_PRECISION *eta2,
-      complex_PRECISION *phi, schwarz_PRECISION_struct *s, level_struct *l,
-      int site, int *direction_flags );
-
-  void diagonal_aggregate_PRECISION_vectorized( complex_PRECISION *eta1, complex_PRECISION *eta2,
-      complex_PRECISION *phi, schwarz_PRECISION_struct *s,
-      level_struct *l, int site );
-
-  // spinors are vectorized, gauge is same for all (use for multiple rhs)
-  static inline void mvm_PRECISION_vectorized_simd_length(
-      const complex_PRECISION *eta, const complex_PRECISION *D, const complex_PRECISION *phi ) {
-#ifdef SSE
-    sse_mvm_PRECISION_simd_length( eta, D, phi );
-#endif
-
-  }
-  // spinors are vectorized, gauge is same for all (use for multiple rhs)
-  static inline void mvm_PRECISION_vectorized(
-      const complex_PRECISION *eta, const complex_PRECISION *D, const complex_PRECISION *phi, int elements ) {
-#ifdef SSE
-    sse_mvm_PRECISION( eta, D, phi, elements );
-#endif
-  }
-
-  // spinors are vectorized, gauge is same for all (use for multiple rhs)
-  static inline void mvmh_PRECISION_vectorized(
-      const complex_PRECISION *eta, const complex_PRECISION *D, const complex_PRECISION *phi, int elements ) {
-#ifdef SSE
-    sse_mvmh_PRECISION( eta, D, phi, elements );
-#endif
-  }
-
-  // mu is according to the enum for T,Z,Y,X defined in clifford.h
-  static inline void twospin_PRECISION_vectorized( complex_PRECISION *out_spin0and1, complex_PRECISION *out_spin2and3, const complex_PRECISION *in, int elements, int mu, double sign ) {
-#ifdef SSE
-    sse_twospin_PRECISION( out_spin0and1, out_spin2and3, in, elements, mu, sign );
-#endif
-  }
-  static inline void twospin_p_T_PRECISION_vectorized( complex_PRECISION *out_spin0and1, complex_PRECISION *out_spin2and3, const complex_PRECISION *in, int elements ) {
-    twospin_PRECISION_vectorized( out_spin0and1,
out_spin2and3, in, elements, T, 1.0); - } - static inline void twospin_n_T_PRECISION_vectorized( complex_PRECISION *out_spin0and1, complex_PRECISION *out_spin2and3, const complex_PRECISION *in, int elements ) { - twospin_PRECISION_vectorized( out_spin0and1, out_spin2and3, in, elements, T, -1.0); - } - static inline void twospin_p_Z_PRECISION_vectorized( complex_PRECISION *out_spin0and1, complex_PRECISION *out_spin2and3, const complex_PRECISION *in, int elements ) { - twospin_PRECISION_vectorized( out_spin0and1, out_spin2and3, in, elements, Z, 1.0); - } - static inline void twospin_n_Z_PRECISION_vectorized( complex_PRECISION *out_spin0and1, complex_PRECISION *out_spin2and3, const complex_PRECISION *in, int elements ) { - twospin_PRECISION_vectorized( out_spin0and1, out_spin2and3, in, elements, Z, -1.0); - } - static inline void twospin_p_Y_PRECISION_vectorized( complex_PRECISION *out_spin0and1, complex_PRECISION *out_spin2and3, const complex_PRECISION *in, int elements ) { - twospin_PRECISION_vectorized( out_spin0and1, out_spin2and3, in, elements, Y, 1.0); - } - static inline void twospin_n_Y_PRECISION_vectorized( complex_PRECISION *out_spin0and1, complex_PRECISION *out_spin2and3, const complex_PRECISION *in, int elements ) { - twospin_PRECISION_vectorized( out_spin0and1, out_spin2and3, in, elements, Y, -1.0); - } - static inline void twospin_p_X_PRECISION_vectorized( complex_PRECISION *out_spin0and1, complex_PRECISION *out_spin2and3, const complex_PRECISION *in, int elements ) { - twospin_PRECISION_vectorized( out_spin0and1, out_spin2and3, in, elements, X, 1.0); - } - static inline void twospin_n_X_PRECISION_vectorized( complex_PRECISION *out_spin0and1, complex_PRECISION *out_spin2and3, const complex_PRECISION *in, int elements ) { - twospin_PRECISION_vectorized( out_spin0and1, out_spin2and3, in, elements, X, -1.0); - } - - // mu is according to the enum for T,Z,Y,X defined in clifford.h - static inline void twospin2_p_PRECISION_vectorized_simd_length( complex_PRECISION *out_spin0and1, complex_PRECISION *out_spin2and3, const complex_PRECISION *in, int mu ) { -#ifdef SSE - sse_twospin2_p_PRECISION_simd_length( out_spin0and1, out_spin2and3, in, mu ); -#endif - } - // mu is according to the enum for T,Z,Y,X defined in clifford.h - static inline void twospin2_p_PRECISION_vectorized( complex_PRECISION *out_spin0and1, complex_PRECISION *out_spin2and3, const complex_PRECISION *in, int elements, int mu ) { -#ifdef SSE - sse_twospin2_p_PRECISION( out_spin0and1, out_spin2and3, in, elements, mu ); -#endif - } - static inline void twospin2_p_T_PRECISION_vectorized( complex_PRECISION *out_spin0and1, complex_PRECISION *out_spin2and3, const complex_PRECISION *in, int elements ) { - twospin2_p_PRECISION_vectorized( out_spin0and1, out_spin2and3, in, elements, T); - } - static inline void twospin2_p_Z_PRECISION_vectorized( complex_PRECISION *out_spin0and1, complex_PRECISION *out_spin2and3, const complex_PRECISION *in, int elements ) { - twospin2_p_PRECISION_vectorized( out_spin0and1, out_spin2and3, in, elements, Z); - } - static inline void twospin2_p_Y_PRECISION_vectorized( complex_PRECISION *out_spin0and1, complex_PRECISION *out_spin2and3, const complex_PRECISION *in, int elements ) { - twospin2_p_PRECISION_vectorized( out_spin0and1, out_spin2and3, in, elements, Y); - } - static inline void twospin2_p_X_PRECISION_vectorized( complex_PRECISION *out_spin0and1, complex_PRECISION *out_spin2and3, const complex_PRECISION *in, int elements ) { - twospin2_p_PRECISION_vectorized( out_spin0and1, out_spin2and3, in, 
elements, X); - } - - static inline void spin0and1_site_clover_PRECISION_vectorized( const complex_PRECISION *eta, const complex_PRECISION *phi, - const config_PRECISION clover, double shift, int elements ) { -#ifdef SSE - sse_spin0and1_site_clover_PRECISION( eta, phi, clover, shift, elements ); -#endif - } - - static inline void spin2and3_site_clover_PRECISION_vectorized( const complex_PRECISION *eta, const complex_PRECISION *phi, - const config_PRECISION clover, double shift, int elements ) { -#ifdef SSE - sse_spin2and3_site_clover_PRECISION( eta, phi, clover, shift, elements ); -#endif - } - -#endif From bb457afb0d66efae9f5a9e72c694de70f52bdd1d Mon Sep 17 00:00:00 2001 From: Marc Illa Date: Tue, 11 Dec 2018 12:02:06 +0200 Subject: [PATCH 31/31] VECTOR_LOOP macro implemented --- src/dirac_generic.c | 44 +-- src/dirac_generic.h | 626 +++++++++++++++++------------------------ src/linalg.c | 41 +-- src/linalg_generic.c | 103 ++----- src/linsolve.c | 131 ++------- src/linsolve_generic.c | 147 +++------- src/main.h | 3 +- src/vector_generic.c | 48 +--- 8 files changed, 365 insertions(+), 778 deletions(-) diff --git a/src/dirac_generic.c b/src/dirac_generic.c index 026b171..ab420ff 100644 --- a/src/dirac_generic.c +++ b/src/dirac_generic.c @@ -150,7 +150,7 @@ void clover_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, operator_PR void clover_PRECISION_new( vector_PRECISION *eta, vector_PRECISION *phi, operator_PRECISION_struct *op, int start, int end, level_struct *l, struct Thread *threading ) { - int nv = l->num_lattice_site_var, n_vect=g.num_rhs_vect, i, j, k; + int nv = l->num_lattice_site_var, n_vect=g.num_rhs_vect, i, j, jj; buffer_PRECISION lphi = phi->vector_buffer+start*n_vect, leta = eta->vector_buffer+start*n_vect; buffer_PRECISION leta_end = eta->vector_buffer+end*n_vect; #ifdef PROFILING @@ -195,33 +195,21 @@ void clover_PRECISION_new( vector_PRECISION *eta, vector_PRECISION *phi, operato if ( g.mu + g.mu_odd_shift != 0.0 || g.mu + g.mu_even_shift != 0.0 ) { while ( leta < leta_end ) for( i=0; i<12; i++ ) { - for( j=0; jnum_vect; j+=num_loop) - #pragma unroll - #pragma vector aligned - #pragma ivdep - for( k=0; knum_vect+j+k] = 0.0; + VECTOR_LOOP(j, psi->num_vect, jj, results[c*psi->num_vect+j+jj] = 0.0;) for(int c=0; cnum_vect; j+=num_loop) - #pragma unroll - #pragma vector aligned - for( k=0; knum_vect+j+k] += (complex_double) conj_float(phi[c].vector_buffer[i*psi->num_vect+j+k])*psi->vector_buffer[i*psi->num_vect+j+k]; - + VECTOR_LOOP(j, psi->num_vect, jj, results[c*psi->num_vect+j+jj] += (complex_double) conj_float(phi[c].vector_buffer[i*psi->num_vect+j+jj])*psi->vector_buffer[i*psi->num_vect+j+jj];) + if(thread == 0 && start != end) PROF_float_STOP( _PIP, (double)(end-start)/(double)l->inner_vector_size, threading ); } @@ -147,25 +138,13 @@ void global_norm_MP_new( double *res, vector_float *x, level_struct *l, struct T if(thread == 0 && start != end) PROF_float_START( _GIP, threading ); - int i, j, k; - for( j=0; jnum_vect; j+=num_loop ) - #pragma unroll - #pragma vector aligned - for( k=0; knum_vect, jj, res[j+jj]=0;) + for( i=start; inum_vect; j+=num_loop ) - #pragma unroll - #pragma vector aligned - for( k=0; kvector_buffer[i*x->num_vect+j+k]); - - for( j=0; jnum_vect; j+=num_loop ) - #pragma unroll - #pragma vector aligned - for( k=0; knum_vect, jj, res[j+jj] += NORM_SQUARE_float(x->vector_buffer[i*x->num_vect+j+jj]);) + + VECTOR_LOOP(j, x->num_vect, jj, res[j+jj] = (double)sqrt((double)res[j+jj]);) if(thread == 0 && start != end) PROF_float_STOP( _GIP, 
(double)(end-start)/(double)l->inner_vector_size, threading ); diff --git a/src/linalg_generic.c b/src/linalg_generic.c index d0ecaf1..22f520f 100644 --- a/src/linalg_generic.c +++ b/src/linalg_generic.c @@ -157,22 +157,12 @@ void process_multi_inner_product_PRECISION_new( int count, complex_PRECISION *re if(thread == 0 && start != end) PROF_PRECISION_START( _PIP, threading ); - int i, j, k; - /*for(int c=0; cnum_vect; c+=num_loop) - #pragma unroll - #pragma vector aligned - for( k=0; knum_vect, results[j] = 0.0;) + int i, j, jj; + VECTOR_LOOP(j, count*psi->num_vect, jj, results[j+jj] = 0.0;) for(int c=0; cnum_vect; j+=num_loop) - #pragma unroll - #pragma vector aligned - for( k=0; knum_vect+j+k] += conj_PRECISION(phi[c].vector_buffer[i*psi->num_vect+j+k])*psi->vector_buffer[i*psi->num_vect+j+k];*/ - vector_loop(j, psi->num_vect, results[c*psi->num_vect+j] += conj_PRECISION(phi[c].vector_buffer[i*psi->num_vect+j])*psi->vector_buffer[i*psi->num_vect+j];) + VECTOR_LOOP(j, psi->num_vect, jj, results[c*psi->num_vect+j+jj] += conj_PRECISION(phi[c].vector_buffer[i*psi->num_vect+j+jj])*psi->vector_buffer[i*psi->num_vect+j+jj];) if(thread == 0 && start != end) PROF_PRECISION_STOP( _PIP, (double)(end-start)/(double)l->inner_vector_size, threading ); @@ -275,26 +265,13 @@ void global_norm_PRECISION_new( PRECISION *res, vector_PRECISION *x, level_struc if(thread == 0 && start != end) PROF_PRECISION_START( _GIP, threading ); - int i, j, k; - /*for( j=0; jnum_vect; j+=num_loop) - #pragma unroll - #pragma vector aligned - for( k=0; knum_vect, res[j]=0;) + int i, j, jj; + VECTOR_LOOP(j, x->num_vect, jj, res[j+jj]=0;) for( i=start; inum_vect; j+=num_loop) - #pragma unroll - for( k=0; kvector_buffer[i*x->num_vect+j+k]);*/ - vector_loop(j, x->num_vect, res[j] += NORM_SQUARE_PRECISION(x->vector_buffer[i*x->num_vect+j]);) - - /*for( j=0; jnum_vect; j+=num_loop) - #pragma unroll - for( k=0; knum_vect, res[j] = (PRECISION)sqrt((double)res[j]);) + VECTOR_LOOP(j, x->num_vect, jj, res[j+jj] += NORM_SQUARE_PRECISION(x->vector_buffer[i*x->num_vect+j+jj]);) + + VECTOR_LOOP(j, x->num_vect, jj, res[j+jj] = (PRECISION)sqrt((double)res[j+jj]);) if(thread == 0 && start != end) PROF_PRECISION_STOP( _GIP, (double)(end-start)/(double)l->inner_vector_size, threading ); @@ -316,20 +293,14 @@ void vector_PRECISION_plus( vector_PRECISION *z, vector_PRECISION *x, vector_PRE void vector_PRECISION_plus_new( vector_PRECISION *z, vector_PRECISION *x, vector_PRECISION *y, level_struct *l, struct Thread *threading ) { - int i, j, k, start, end; + int i, j, jj, start, end; compute_core_start_end(0, x->size, &start, &end, l, threading); int thread = omp_get_thread_num(); if(thread == 0 && start != end) PROF_PRECISION_START( _LA2 ); for( i=start; inum_vect; j+=num_loop) - #pragma unroll - #pragma vector aligned - #pragma ivdep - for( k=0; kvector_buffer[i*x->num_vect+j+k] = x->vector_buffer[i*x->num_vect+j+k] + y->vector_buffer[i*x->num_vect+j+k];*/ - vector_loop(j, x->num_vect, z->vector_buffer[i*x->num_vect+j] = x->vector_buffer[i*x->num_vect+j] + y->vector_buffer[i*x->num_vect+j];) + VECTOR_LOOP(j, x->num_vect, jj, z->vector_buffer[i*x->num_vect+j+jj] = x->vector_buffer[i*x->num_vect+j+jj] + y->vector_buffer[i*x->num_vect+j+jj];) if(thread == 0 && start != end) PROF_PRECISION_STOP( _LA2, (double)(end-start)/(double)l->inner_vector_size ); @@ -358,13 +329,7 @@ void vector_PRECISION_minus_new( vector_PRECISION *z, vector_PRECISION *x, vecto PROF_PRECISION_START( _LA2 ); for( i=start; inum_vect; j+=num_loop) - #pragma unroll - #pragma 
vector aligned - #pragma ivdep - for( k=0; kvector_buffer[i*x->num_vect+j+k] = x->vector_buffer[i*x->num_vect+j+k] - y->vector_buffer[i*x->num_vect+j+k];*/ - vector_loop2(j, x->num_vect, jj, z->vector_buffer[i*x->num_vect+j+jj] = x->vector_buffer[i*x->num_vect+j+jj] - y->vector_buffer[i*x->num_vect+j+jj];) + VECTOR_LOOP(j, x->num_vect, jj, z->vector_buffer[i*x->num_vect+j+jj] = x->vector_buffer[i*x->num_vect+j+jj] - y->vector_buffer[i*x->num_vect+j+jj];) if(thread == 0 && start != end) PROF_PRECISION_STOP( _LA2, (double)(end-start)/(double)l->inner_vector_size ); @@ -384,20 +349,14 @@ void vector_PRECISION_scale( vector_PRECISION *z, vector_PRECISION *x, complex_P void vector_PRECISION_scale_new( vector_PRECISION *z, vector_PRECISION *x, complex_PRECISION *alpha, int k, level_struct *l, struct Thread *threading ) { - int i, j, n, start, end; + int i, j, jj, start, end; compute_core_start_end(0, x->size, &start, &end, l, threading); int thread = omp_get_thread_num(); if(thread == 0 && start != end) PROF_PRECISION_START( _LA6 ); for( i=start; inum_vect; j+=num_loop) - #pragma unroll - #pragma vector aligned - #pragma ivdep - for( n=0; nvector_buffer[i*x->num_vect+j+n] = alpha[k*x->num_vect+j+n]*x->vector_buffer[i*x->num_vect+j+n];*/ - vector_loop(j, x->num_vect, z->vector_buffer[i*x->num_vect+j] = alpha[k*x->num_vect+j]*x->vector_buffer[i*x->num_vect+j];) + VECTOR_LOOP(j, x->num_vect, jj, z->vector_buffer[i*x->num_vect+j+jj] = alpha[k*x->num_vect+j+jj]*x->vector_buffer[i*x->num_vect+j+jj];) if(thread == 0 && start != end) PROF_PRECISION_STOP( _LA6, (double)(end-start)/(double)l->inner_vector_size ); @@ -450,7 +409,7 @@ void vector_PRECISION_saxpy( vector_PRECISION *z, vector_PRECISION *x, vector_PR // else: minus void vector_PRECISION_saxpy_new( vector_PRECISION *z, vector_PRECISION *x, vector_PRECISION *y, complex_PRECISION *alpha, int k, int sign, level_struct *l, struct Thread *threading ) { - int i, j, n, start, end; + int i, j, jj, start, end; compute_core_start_end(0, x->size, &start, &end, l, threading); int thread = omp_get_thread_num(); if (thread == 0 && start != end ) @@ -458,23 +417,10 @@ void vector_PRECISION_saxpy_new( vector_PRECISION *z, vector_PRECISION *x, vecto if( sign == 1 ) for( i=start; inum_vect; j+=num_loop) - #pragma unroll - #pragma vector aligned - #pragma ivdep - for( n=0; nvector_buffer[i*x->num_vect+j+n] = x->vector_buffer[i*x->num_vect+j+n] + alpha[k*x->num_vect+j+n]*y->vector_buffer[i*x->num_vect+j+n];*/ - vector_loop(j, x->num_vect, z->vector_buffer[i*x->num_vect+j] = x->vector_buffer[i*x->num_vect+j] + alpha[k*x->num_vect+j]*y->vector_buffer[i*x->num_vect+j];) + VECTOR_LOOP(j, x->num_vect, jj, z->vector_buffer[i*x->num_vect+j+jj] = x->vector_buffer[i*x->num_vect+j+jj] + alpha[k*x->num_vect+j+jj]*y->vector_buffer[i*x->num_vect+j+jj];) else for( i=start; inum_vect; j+=num_loop) - #pragma unroll - #pragma vector aligned - #pragma ivdep - for( n=0; nvector_buffer[i*x->num_vect+j+n] = x->vector_buffer[i*x->num_vect+j+n] - alpha[k*x->num_vect+j+n]*y->vector_buffer[i*x->num_vect+j+n];*/ - vector_loop(j, x->num_vect, z->vector_buffer[i*x->num_vect+j] = x->vector_buffer[i*x->num_vect+j] - alpha[k*x->num_vect+j]*y->vector_buffer[i*x->num_vect+j];) - + VECTOR_LOOP(j, x->num_vect, jj, z->vector_buffer[i*x->num_vect+j+jj] = x->vector_buffer[i*x->num_vect+j+jj] - alpha[k*x->num_vect+j+jj]*y->vector_buffer[i*x->num_vect+j+jj];) if( thread == 0 && start != end ) PROF_PRECISION_STOP( _LA8, (double)(end-start)/(double)l->inner_vector_size ); @@ -505,7 +451,7 @@ void 
vector_PRECISION_multi_saxpy( vector_PRECISION *z, vector_PRECISION *V, com void vector_PRECISION_multi_saxpy_new( vector_PRECISION *z, vector_PRECISION *V, complex_PRECISION *alpha, int sign, int count, level_struct *l, struct Thread *threading ) { - int c, i, j, k, start, end; + int c, i, j, jj, start, end; compute_core_start_end(0, z->size, &start, &end, l, threading); int thread = omp_get_thread_num(); if (thread == 0 && start != end ) @@ -513,22 +459,11 @@ void vector_PRECISION_multi_saxpy_new( vector_PRECISION *z, vector_PRECISION *V, complex_PRECISION alpha_signed[count*z->num_vect]; for ( c=0; cnum_vect; j+=num_loop) - #pragma unroll - #pragma vector aligned - for( k=0; knum_vect+j+k] = sign*alpha[c*z->num_vect+j+k];*/ - vector_loop(j, z->num_vect, alpha_signed[c*z->num_vect+j] = sign*alpha[c*z->num_vect+j];) + VECTOR_LOOP(j, z->num_vect, jj, alpha_signed[c*z->num_vect+j+jj] = sign*alpha[c*z->num_vect+j+jj];) for ( c=0; cnum_vect; j+=num_loop) - #pragma unroll - #pragma vector aligned - #pragma ivdep - for( k=0; kvector_buffer[i*z->num_vect+j+k] += V[c].vector_buffer[i*z->num_vect+j+k]*alpha_signed[c];*/ - vector_loop(j, z->num_vect, z->vector_buffer[i*z->num_vect+j] += V[c].vector_buffer[i*z->num_vect+j]*alpha_signed[c];) + VECTOR_LOOP(j, z->num_vect, jj, z->vector_buffer[i*z->num_vect+j+jj] += V[c].vector_buffer[i*z->num_vect+j+jj]*alpha_signed[c];) if( thread == 0 && start != end ) PROF_PRECISION_STOP( _LA8, (PRECISION)(count) ); diff --git a/src/linsolve.c b/src/linsolve.c index fe89e54..df5eca2 100644 --- a/src/linsolve.c +++ b/src/linsolve.c @@ -163,7 +163,7 @@ int fgmres_MP( gmres_MP_struct *p, level_struct *l, struct Thread *threading ) { int start; int end; - int j=-1, finish=0, iter=0, il, ol, n_vect=g.num_rhs_vect, i, k;//n_vec; + int j=-1, finish=0, iter=0, il, ol, n_vect=g.num_rhs_vect, i, jj; complex_double gamma0[n_vect];//gamma0=0; double beta[n_vect]; //beta=0; @@ -171,17 +171,9 @@ int fgmres_MP( gmres_MP_struct *p, level_struct *l, struct Thread *threading ) { double norm_r0[n_vect], gamma_jp1[n_vect], gamma0_real[n_vect], gamma_tot, H_tot, gamma_tot2;//norm_r0=1, gamma_jp1=1 complex_float gamma_float[n_vect]; - /*for( i=0; idepth==0 && ( p->dp.timing || p->dp.print ) ) prof_init( l ); @@ -211,18 +203,12 @@ int fgmres_MP( gmres_MP_struct *p, level_struct *l, struct Thread *threading ) { } //gamma0 = (complex_double) global_norm_double( &(p->dp.r), p->dp.v_start, p->dp.v_end, l, threading ); // gamma_0 = norm(r) global_norm_double_new( gamma0_real, &(p->dp.r), l, threading ); - for( i=0; idp.gamma[0] = gamma0; - for( i=0; idp.gamma[i+k] = gamma0[i+k]; + VECTOR_LOOP(i, n_vect, jj, p->dp.gamma[i+jj] = gamma0[i+jj];) + END_MASTER(threading) SYNC_MASTER_TO_ALL(threading) @@ -234,10 +220,7 @@ int fgmres_MP( gmres_MP_struct *p, level_struct *l, struct Thread *threading ) { printf0("| initial guess relative residual (%d): %le |\n", i, creal(gamma0[i])/norm_r0[i]); } else { //norm_r0 = creal(gamma0); - for( i=0; isp.V[0]), &(p->dp.r), l->s_float.op.translation_table, l, threading ); //vector_float_real_scale( &(p->sp.V[0]), &(p->sp.V[0]), (float)(1/p->dp.gamma[0]), start, end, l ); // V[0] <- r / gamma_0 - for( i=0; idp.gamma[0*n_vect+i+k]; + VECTOR_LOOP(i, n_vect, jj, gamma_float[i+jj]= (complex_float) p->dp.gamma[0*n_vect+i+jj];) vector_float_real_scale_new( &(p->sp.V[0]), &(p->sp.V[0]), gamma_float, 0, 1, l, threading ); // inner loop in single precision for( il=0; ildp.restart_length && finish==0; il++) { j = il; iter++; arnoldi_step_MP_new( p->sp.V, p->sp.Z, 
diff --git a/src/linsolve.c b/src/linsolve.c
index fe89e54..df5eca2 100644
--- a/src/linsolve.c
+++ b/src/linsolve.c
@@ -163,7 +163,7 @@ int fgmres_MP( gmres_MP_struct *p, level_struct *l, struct Thread *threading ) {
 
   int start;
   int end;
-  int j=-1, finish=0, iter=0, il, ol, n_vect=g.num_rhs_vect, i, k;//n_vec;
+  int j=-1, finish=0, iter=0, il, ol, n_vect=g.num_rhs_vect, i, jj;
   complex_double gamma0[n_vect];//gamma0=0;
 
   double beta[n_vect]; //beta=0;
@@ -171,17 +171,9 @@ int fgmres_MP( gmres_MP_struct *p, level_struct *l, struct Thread *threading ) {
   double norm_r0[n_vect], gamma_jp1[n_vect], gamma0_real[n_vect], gamma_tot, H_tot, gamma_tot2;//norm_r0=1, gamma_jp1=1
   complex_float gamma_float[n_vect];
 
-  /*for( i=0; i<n_vect; i++){ norm_r0[i]=1; gamma_jp1[i]=1; }*/
 
   if( l->depth==0 && ( p->dp.timing || p->dp.print ) ) prof_init( l );
@@ -211,18 +203,12 @@ int fgmres_MP( gmres_MP_struct *p, level_struct *l, struct Thread *threading ) {
   }
   //gamma0 = (complex_double) global_norm_double( &(p->dp.r), p->dp.v_start, p->dp.v_end, l, threading ); // gamma_0 = norm(r)
   global_norm_double_new( gamma0_real, &(p->dp.r), l, threading );
-  for( i=0; i<n_vect; i++)
-    gamma0[i] = (complex_double) gamma0_real[i];
+  VECTOR_LOOP(i, n_vect, jj, gamma0[i+jj] = (complex_double) gamma0_real[i+jj];)
 
   START_MASTER(threading)
   //p->dp.gamma[0] = gamma0;
-  for( i=0; i<n_vect; i+=num_loop)
-    for( k=0; k<num_loop; k++)
-      p->dp.gamma[i+k] = gamma0[i+k];
+  VECTOR_LOOP(i, n_vect, jj, p->dp.gamma[i+jj] = gamma0[i+jj];)
+  END_MASTER(threading)
 
   SYNC_MASTER_TO_ALL(threading)
@@ -234,10 +220,7 @@ int fgmres_MP( gmres_MP_struct *p, level_struct *l, struct Thread *threading ) {
       printf0("| initial guess relative residual (%d): %le |\n", i, creal(gamma0[i])/norm_r0[i]);
   } else {
     //norm_r0 = creal(gamma0);
-    for( i=0; i<n_vect; i+=num_loop)
-      for( k=0; k<num_loop; k++)
-        norm_r0[i+k] = creal(gamma0[i+k]);
+    VECTOR_LOOP(i, n_vect, jj, norm_r0[i+jj] = creal(gamma0[i+jj]);)
   }
@@ ... @@ int fgmres_MP( gmres_MP_struct *p, level_struct *l, struct Thread *threading ) {
   trans_float_new( &(p->sp.V[0]), &(p->dp.r), l->s_float.op.translation_table, l, threading );
   //vector_float_real_scale( &(p->sp.V[0]), &(p->sp.V[0]), (float)(1/p->dp.gamma[0]), start, end, l ); // V[0] <- r / gamma_0
-  for( i=0; i<n_vect; i+=num_loop)
-    for( k=0; k<num_loop; k++)
-      gamma_float[i+k]= (complex_float) p->dp.gamma[0*n_vect+i+k];
+  VECTOR_LOOP(i, n_vect, jj, gamma_float[i+jj]= (complex_float) p->dp.gamma[0*n_vect+i+jj];)
   vector_float_real_scale_new( &(p->sp.V[0]), &(p->sp.V[0]), gamma_float, 0, 1, l, threading );
 
   // inner loop in single precision
   for( il=0; il<p->dp.restart_length && finish==0; il++) {
     j = il; iter++;
 
     arnoldi_step_MP_new( p->sp.V, p->sp.Z, &(p->sp.w), p->dp.H, p->dp.y, j, p->sp.preconditioner, &(p->sp), l, threading );
 
     H_tot=0;
-    for( i=0; i<n_vect; i+=num_loop)
-      for( k=0; k<num_loop; k++)
-        H_tot += cabs( p->dp.H[j][(j+1)*n_vect+i+k] );
+    VECTOR_LOOP(i, n_vect, jj, H_tot += cabs( p->dp.H[j][(j+1)*n_vect+i+jj] );)
 
     //if ( cabs( p->dp.H[j][j+1] ) > 1E-15 )
     if ( H_tot > n_vect*1E-15 ) {
       qr_update_double( p->dp.H, p->dp.s, p->dp.c, p->dp.gamma, j, l, threading );
       //gamma_jp1 = cabs( p->dp.gamma[j+1] );
-      for( i=0; i<n_vect; i+=num_loop)
-        for( k=0; k<num_loop; k++)
-          gamma_jp1[i+k] = cabs( p->dp.gamma[(j+1)*n_vect+i+k] );
+      VECTOR_LOOP(i, n_vect, jj, gamma_jp1[i+jj] = cabs( p->dp.gamma[(j+1)*n_vect+i+jj] );)
 
       if ( iter%10 == 0 || p->sp.preconditioner != NULL || l->depth > 0 ) {
 #if defined(TRACK_RES) && !defined(WILSON_BENCHMARK)
@@ -290,10 +261,7 @@ int fgmres_MP( gmres_MP_struct *p, level_struct *l, struct Thread *threading ) {
 #endif
       }
       gamma_tot=0;
-      for( i=0; i<n_vect; i+=num_loop)
-        for( k=0; k<num_loop; k++)
-          gamma_tot += gamma_jp1[i+k];
+      VECTOR_LOOP(i, n_vect, jj, gamma_tot += gamma_jp1[i+jj];)
      //if( gamma_jp1/norm_r0 < p->dp.tol || gamma_jp1/norm_r0 > 1E+5 ) // if satisfied ... stop
      if( gamma_tot < n_vect*p->dp.tol || gamma_tot > n_vect*1E+5 ) {
@@ -303,10 +271,7 @@ int fgmres_MP( gmres_MP_struct *p, level_struct *l, struct Thread *threading ) {
         END_MASTER(threading)
       }
       gamma_tot2=0;
-      for( i=0; i<n_vect; i+=num_loop)
-        for( k=0; k<num_loop; k++)
-          gamma_tot2 += gamma_jp1[i+k]/norm_r0[i+k];
+      VECTOR_LOOP(i, n_vect, jj, gamma_tot2 += gamma_jp1[i+jj]/norm_r0[i+jj];)
       //if( gamma_jp1/norm_r0 < p->sp.tol )
       if( gamma_tot2 < n_vect*p->sp.tol ){
         break;
       }
@@ -340,10 +305,7 @@ int fgmres_MP( gmres_MP_struct *p, level_struct *l, struct Thread *threading ) {
   //beta = global_norm_double( &(p->dp.r), p->dp.v_start, p->dp.v_end, l, threading );
   global_norm_double_new( beta, &(p->dp.r), l, threading );
 #else
-  for( i=0; i<n_vect; i+=num_loop)
-    for( k=0; k<num_loop; k++)
-      beta[i+k] = gamma_jp1[i+k];
+  VECTOR_LOOP(i, n_vect, jj, beta[i+jj] = gamma_jp1[i+jj];)
 #endif
 
   if ( g.num_processes > 1 ) {
     PROF_double_START( _ALLR );
@@ -505,22 +463,14 @@ void arnoldi_step_MP_new( vector_float *V, vector_float *Z, vector_float *w,
     PROF_double_STOP( _ALLR, 1 );
   } else {
     for( i=0; i<=j; i++ )
-      for( n=0; n<n_vect; n+=num_loop)
-        for( k=0; k<num_loop; k++)
-          H[j][i*n_vect+n+k] = buffer[i*n_vect+n+k];
+      VECTOR_LOOP(n, n_vect, jj, H[j][i*n_vect+n+jj] = buffer[i*n_vect+n+jj];)
   }
 
   if ( H_tot > n_vect*1e-15 ){
-    for( n=0; n<n_vect; n+=num_loop)
-      for( k=0; k<num_loop; k++)
-        gamma_float[n+k] = (complex_float) H[j][(j+1)*n_vect+n+k];
+    VECTOR_LOOP(n, n_vect, jj, gamma_float[n+jj] = (complex_float) H[j][(j+1)*n_vect+n+jj];)
     vector_float_real_scale_new( &V[j+1], w, gamma_float, 0, 1, l, threading );
   }
@@ ... @@
   for ( i=j; i>=0; i-- ) {
-    for ( n=0; n<n_vect; n+=num_loop)
-      for ( k=0; k<num_loop; k++)
-        y[i*n_vect+n+k] = gamma[i*n_vect+n+k];
+    VECTOR_LOOP(n, n_vect, jj, y[i*n_vect+n+jj] = gamma[i*n_vect+n+jj];)
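fgmres_MP's stopping logic above is now aggregated over right-hand sides: the per-rhs residual estimates gamma_jp1[i] are summed into gamma_tot and the sum is tested against n_vect*tol (with n_vect*1E+5 as a divergence guard). A minimal sketch of that test with plain doubles, names assumed:

    #include <stdio.h>

    /* returns 1 once the summed residual says stop (converged or diverged) */
    static int satisfied( const double *gamma_jp1, int n_vect, double tol ) {
      double gamma_tot = 0;
      for ( int i=0; i<n_vect; i++ )
        gamma_tot += gamma_jp1[i];
      return gamma_tot < n_vect*tol || gamma_tot > n_vect*1E+5;
    }

    int main( void ) {
      double g1[2] = { 1e-12, 1e-12 }, g2[2] = { 1e-12, 1.0 };
      printf( "%d %d\n", satisfied( g1, 2, 1e-10 ),
                         satisfied( g2, 2, 1e-10 ) );   /* prints: 1 0 */
      return 0;
    }

One consequence of testing the sum rather than each entry: a single slow right-hand side keeps all of them iterating, and a residual far below tolerance on one rhs can compensate for another that sits slightly above it.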
diff --git a/src/linsolve_generic.c b/src/linsolve_generic.c
--- a/src/linsolve_generic.c
+++ b/src/linsolve_generic.c
@@ ... @@ int fgmres_PRECISION( gmres_PRECISION_struct *p, level_struct *l, struct Thread
-  int i, j=-1, finish=0, iter=0, il, ol, n_vect=g.num_rhs_vect, k;
+  int i, j=-1, finish=0, iter=0, il, ol, n_vect=g.num_rhs_vect, jj;
 
   if( l->depth==0 && ( p->timing || p->print ) ) prof_init( l );
@@ -283,18 +281,12 @@ int fgmres_PRECISION( gmres_PRECISION_struct *p, level_struct *l, struct Thread
   }
   //gamma0 = global_norm_PRECISION( &(p->r), p->v_start, p->v_end, l, threading ); // gamma_0 = norm(r)
   global_norm_PRECISION_new( gamma0_real, &(p->r), l, threading );
-  for( i=0; i<n_vect; i++)
-    gamma0[i] = (complex_PRECISION) gamma0_real[i];
+  VECTOR_LOOP(i, n_vect, jj, gamma0[i+jj] = (complex_PRECISION) gamma0_real[i+jj];)
 
   START_MASTER(threading)
   //p->gamma[0] = gamma0;
-  for( i=0; i<n_vect; i+=num_loop)
-    for( k=0; k<num_loop; k++)
-      p->gamma[i+k] = gamma0[i+k];
+  VECTOR_LOOP(i, n_vect, jj, p->gamma[i+jj] = gamma0[i+jj];)
   END_MASTER(threading);
 
   SYNC_MASTER_TO_ALL(threading);
@@ -307,10 +299,7 @@ int fgmres_PRECISION( gmres_PRECISION_struct *p, level_struct *l, struct Thread
       printf0("| initial guess relative residual (%d): %le |\n", i, creal(gamma0[i])/norm_r0[i]);
   } else {
     //norm_r0 = creal(p->gamma[0]);
-    for( i=0; i<n_vect; i+=num_loop)
-      for( k=0; k<num_loop; k++)
-        norm_r0[i+k] = creal(p->gamma[i+k]);
+    VECTOR_LOOP(i, n_vect, jj, norm_r0[i+jj] = creal(p->gamma[i+jj]);)
   }
   }
   //vector_PRECISION_real_scale( &(p->V[0]), &(p->r), 1/p->gamma[0], start, end, l ); // v_0 = r / gamma_0
@@ -349,21 +338,13 @@ int fgmres_PRECISION( gmres_PRECISION_struct *p, level_struct *l, struct Thread
     }
 #endif
     H_tot=0;
-    for( i=0; i<n_vect; i+=num_loop)
-      for( k=0; k<num_loop; k++)
-        H_tot += cabs( p->H[j][(j+1)*n_vect+i+k] );
+    VECTOR_LOOP(i, n_vect, jj, H_tot += cabs( p->H[j][(j+1)*n_vect+i+jj] );)
     //if ( cabs( p->H[j][j+1] ) > p->tol/10 )
     if ( H_tot > n_vect*p->tol/10 ) {
       qr_update_PRECISION( p->H, p->s, p->c, p->gamma, j, l, threading );
       //gamma_jp1 = cabs( p->gamma[(j+1)] );
-      for( i=0; i<n_vect; i+=num_loop)
-        for( k=0; k<num_loop; k++)
-          gamma_jp1[i+k] = cabs( p->gamma[(j+1)*n_vect+i+k] );
+      VECTOR_LOOP(i, n_vect, jj, gamma_jp1[i+jj] = cabs( p->gamma[(j+1)*n_vect+i+jj] );)
 
 #if defined(TRACK_RES) && !defined(WILSON_BENCHMARK)
       if ( iter%10 == 0 || p->preconditioner != NULL || l->depth > 0 ) {
@@ -375,10 +356,7 @@ int fgmres_PRECISION( gmres_PRECISION_struct *p, level_struct *l, struct Thread
       }
 #endif
       gamma_tot=0;
-      for( i=0; i<n_vect; i+=num_loop)
-        for( k=0; k<num_loop; k++)
-          gamma_tot += gamma_jp1[i+k];
+      VECTOR_LOOP(i, n_vect, jj, gamma_tot += gamma_jp1[i+jj];)
      //if( gamma_jp1/norm_r0 < p->tol || gamma_jp1/norm_r0 > 1E+5 ) // if satisfied ... stop
      if( gamma_tot < n_vect*p->tol || gamma_tot > n_vect*1E+5 ) {
@@ -412,18 +390,12 @@ int fgmres_PRECISION( gmres_PRECISION_struct *p, level_struct *l, struct Thread
   //beta = global_norm_PRECISION( &(p->r), p->v_start+p->r.size*n_vec, p->v_end+p->r.size*n_vec, l, threading );
   global_norm_PRECISION_new( beta, &(p->r), l, threading );
 #else
-  for( i=0; i<n_vect; i+=num_loop)
-    for( k=0; k<num_loop; k++)
-      beta[i+k] = gamma_jp1[i+k];
+  VECTOR_LOOP(i, n_vect, jj, beta[i+jj] = gamma_jp1[i+jj];)
 #endif
 
   if ( p->print > 0 )
     printf0("+----------------------------------------------------------+\n\n");
 #endif
@@ -1117,7 +1089,7 @@ int arnoldi_step_PRECISION_new( vector_PRECISION *V, vector_PRECISION *Z, vector
 #else
   SYNC_MASTER_TO_ALL(threading)
   SYNC_CORES(threading)
-  int i, n_vect=g.num_rhs_vect, n, k;
+  int i, n_vect=g.num_rhs_vect, n, jj;
   PRECISION H_tot;
   // start and end indices for vector functions depending on thread
   int start, end;
@@ -1151,11 +1123,7 @@ int arnoldi_step_PRECISION_new( vector_PRECISION *V, vector_PRECISION *Z, vector
     process_multi_inner_product_PRECISION_new( j+1, tmp, V, w, l, threading );
     START_MASTER(threading)
     for( i=0; i<=j; i++ )
-      for( n=0; n<n_vect; n+=num_loop)
-        for( k=0; k<num_loop; k++)
-          buffer[i*n_vect+n+k] = tmp[i*n_vect+n+k];
+      VECTOR_LOOP(n, n_vect, jj, buffer[i*n_vect+n+jj] = tmp[i*n_vect+n+jj];)
     if ( g.num_processes > 1 ) {
       PROF_PRECISION_START( _ALLR );
@@ -1163,11 +1131,7 @@ int arnoldi_step_PRECISION_new( vector_PRECISION *V, vector_PRECISION *Z, vector
       PROF_PRECISION_STOP( _ALLR, 1 );
     } else {
       for( i=0; i<=j; i++ )
-        for( n=0; n<n_vect; n+=num_loop)
-          for( k=0; k<num_loop; k++)
-            H[j][i*n_vect+n+k] = buffer[i*n_vect+n+k];
+        VECTOR_LOOP(n, n_vect, jj, H[j][i*n_vect+n+jj] = buffer[i*n_vect+n+jj];)
     }
     if ( g.num_processes > 1 ) {
       PROF_PRECISION_START( _ALLR );
@@ -1191,10 +1152,7 @@ int arnoldi_step_PRECISION_new( vector_PRECISION *V, vector_PRECISION *Z, vector
   }
 
   for( i=0; i<=j; i++ )
-    for( n=0; n<n_vect; n+=num_loop)
-      for( k=0; k<num_loop; k++)
-        H_tot += cabs_PRECISION( p->H[j][(j+1)*n_vect+i+k] );
+    VECTOR_LOOP(n, n_vect, jj, H_tot += cabs_PRECISION( p->H[j][(j+1)*n_vect+n+jj] );)
   if ( H_tot > n_vect*1e-15 )
     vector_PRECISION_real_scale_new( &V[j+1], w, H[j], j+1, 1, l, threading );
 #endif
@@ -1249,7 +1200,7 @@ void qr_update_PRECISION( complex_PRECISION **H, complex_PRECISION *s,
 
   PROF_PRECISION_START( _SMALL1 );
 
-  int i, n, k, n_vect=g.num_rhs_vect;
+  int i, n, jj, n_vect=g.num_rhs_vect;
   complex_PRECISION beta[n_vect];
 
   // update QR factorization
@@ -1262,41 +1213,16 @@ void qr_update_PRECISION( complex_PRECISION **H, complex_PRECISION *s,
   }
   // compute current Givens rotation
-  for( n=0; n<n_vect; n+=num_loop)
-    for( k=0; k<num_loop; k++)
-      beta[n+k] = (complex_PRECISION) sqrt( NORM_SQUARE_PRECISION( H[j][j*n_vect+n+k] ) + NORM_SQUARE_PRECISION( H[j][(j+1)*n_vect+n+k] ) );
+  VECTOR_LOOP(n, n_vect, jj, beta[n+jj] = (complex_PRECISION) sqrt( NORM_SQUARE_PRECISION( H[j][j*n_vect+n+jj] ) + NORM_SQUARE_PRECISION( H[j][(j+1)*n_vect+n+jj] ) );)
@@ ... @@
   for ( i=j; i>=0; i-- ) {
-    for ( n=0; n<n_vect; n+=num_loop)
-      for ( k=0; k<num_loop; k++)
-        y[i*n_vect+n+k] = gamma[i*n_vect+n+k];
+    VECTOR_LOOP(n, n_vect, jj, y[i*n_vect+n+jj] = gamma[i*n_vect+n+jj];)
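In qr_update_PRECISION the scalars beta, s[j], c[j] become length-n_vect arrays, one Givens rotation per right-hand side, indexed as row*n_vect+rhs. The sketch below shows the standard GMRES Givens update applied per right-hand side, in plain complex double; names and layout are assumptions, not the exact body of the hunk:

    #include <complex.h>
    #include <math.h>
    #include <stdio.h>

    static void givens_per_rhs( double complex *h_jj, double complex *h_jp1,
                                double complex *s, double complex *c,
                                double complex *gamma_j, double complex *gamma_jp1,
                                int n_vect ) {
      for ( int n=0; n<n_vect; n++ ) {
        double beta = sqrt( cabs( h_jj[n] )*cabs( h_jj[n] )
                          + cabs( h_jp1[n] )*cabs( h_jp1[n] ) );
        s[n] = h_jp1[n]/beta;               /* rotation coefficients per rhs */
        c[n] = h_jj[n]/beta;
        gamma_jp1[n] = -s[n]*gamma_j[n];    /* update the least-squares rhs  */
        gamma_j[n]   = conj( c[n] )*gamma_j[n];
        h_jj[n]  = beta;                    /* apply rotation to the column  */
        h_jp1[n] = 0;
      }
    }

    int main( void ) {
      double complex hjj[1] = { 3 }, hjp1[1] = { 4*I };
      double complex s[1], c[1], gj[1] = { 1 }, gjp1[1];
      givens_per_rhs( hjj, hjp1, s, c, gj, gjp1, 1 );
      printf( "beta = %g, |H_j+1,j| = %g\n", creal( hjj[0] ), cabs( hjp1[0] ) );
      /* prints: beta = 5, |H_j+1,j| = 0 */
      return 0;
    }

Each rotation zeroes the subdiagonal entry H_{j+1,j} for its own right-hand side, so the n_vect least-squares problems stay fully independent.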
@@ ... @@ void vector_PRECISION_real_scale_new( vector_PRECISION *z, vector_PRECISION *x,
-  int i, j, k, start, end;
-  compute_core_start_end(0, x->size, &start, &end, l, threading);
+  int i, j, jj, start, end;
   PRECISION r_alpha[x->num_vect];
   if(opt){
-    for( j=0; j<x->num_vect; j+=num_loop)
-      #pragma unroll
-      for( k=0; k<num_loop; k++)
-        r_alpha[j+k]=1.0/creal_PRECISION(alpha[n*x->num_vect+j+k]);
+    VECTOR_LOOP(j, x->num_vect, jj, r_alpha[j+jj]=1.0/creal_PRECISION(alpha[n*x->num_vect+j+jj]);)
   }else{
-    for( j=0; j<x->num_vect; j+=num_loop)
-      #pragma unroll
-      for( k=0; k<num_loop; k++)
-        r_alpha[j+k]=creal_PRECISION(alpha[n*x->num_vect+j+k]);
+    VECTOR_LOOP(j, x->num_vect, jj, r_alpha[j+jj]=creal_PRECISION(alpha[n*x->num_vect+j+jj]);)
   }
+  compute_core_start_end(0, x->size, &start, &end, l, threading);
   int thread = omp_get_thread_num();
   if(thread == 0 && start != end)
     PROF_PRECISION_START( _RS );
-
-  //PRECISION *restrict r_z = (PRECISION*)z->vector_buffer, *restrict r_x = (PRECISION*)x->vector_buffer;
-
-  //for( i=start; i<end; i++)
-  //  for( j=0; j<x->num_vect; j++)
-  //    z->vector_buffer[i*x->num_vect+j] = r_alpha[j]*x->vector_buffer[i*x->num_vect+j];
   //vector_PRECISION_change_layout( x, x, _LV_SV_NV, no_threading );
   //vector_PRECISION_change_layout( z, z, _LV_SV_NV, no_threading );
   if(z == x){
     for( i=start; i<end; i++)
-      for( j=0; j<x->num_vect; j+=num_loop)
-        #pragma unroll
-        #pragma vector aligned
-        for( k=0; k<num_loop; k++)
-          z->vector_buffer[i*x->num_vect+j+k] *= r_alpha[j+k]; //*z->vector_buffer[i*x->num_vect+j+k];
+      VECTOR_LOOP(j, x->num_vect, jj, z->vector_buffer[i*x->num_vect+j+jj] *= r_alpha[j+jj];)
   } else {
-    // PRECISION * restrict r_z = (PRECISION*)z->vector_buffer, * restrict r_x = (PRECISION*)x->vector_buffer;
     for( i=start; i<end; i++)
-      for( j=0; j<x->num_vect; j+=num_loop)
-      /* #pragma unroll
-      #pragma vector aligned
-      #pragma ivdep
-      for( k=0; k<num_loop; k++) */
-      vector_loop(j, x->num_vect, z->vector_buffer[i*x->num_vect+j] = r_alpha[j]*x->vector_buffer[i*x->num_vect+j];)
+      VECTOR_LOOP(j, x->num_vect, jj, z->vector_buffer[i*x->num_vect+j+jj] = r_alpha[j+jj]*x->vector_buffer[i*x->num_vect+j+jj];)
   }
   //vector_PRECISION_change_layout( x, x, _NV_LV_SV, no_threading );
   //vector_PRECISION_change_layout( z, z, _NV_LV_SV, no_threading );
@@ -203,25 +182,14 @@ void vector_PRECISION_copy_new( vector_PRECISION *z, vector_PRECISION *x, level_
   if(z == x)
     return;
 
-  int i, j, k, start, end;
-  //PRECISION * restrict r_z = (PRECISION*)z->vector_buffer, * restrict r_x = (PRECISION*)x->vector_buffer;
+  int i, j, jj, start, end;
   compute_core_start_end(0, x->size, &start, &end, l, threading);
   int thread = omp_get_thread_num();
   if(thread == 0)
     PROF_PRECISION_START( _CPY );
-  //vector_PRECISION_change_layout( x, x, _LV_SV_NV, no_threading );
-  //vector_PRECISION_change_layout( z, z, _LV_SV_NV, no_threading );
-  //for( i=start; i<end; i++)
-  //  for( j=0; j<x->num_vect; j++)
-  //    z->vector_buffer[i*x->num_vect+j] = x->vector_buffer[i*x->num_vect+j];
   for( i=start; i<end; i++)
-    for( j=0; j<x->num_vect; j+=num_loop)
-      #pragma unroll
-      #pragma vector aligned
-      #pragma ivdep
-      for( k=0; k<num_loop; k++)
-        z->vector_buffer[i*x->num_vect+j+k] = x->vector_buffer[i*x->num_vect+j+k];
+    VECTOR_LOOP(j, x->num_vect, jj, z->vector_buffer[i*x->num_vect+j+jj] = x->vector_buffer[i*x->num_vect+j+jj];)
 
   //vector_PRECISION_change_layout( x, x, _NV_LV_SV, no_threading );
   //vector_PRECISION_change_layout( z, z, _NV_LV_SV, no_threading );
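vector_PRECISION_real_scale_new above scales by the real part of alpha, with opt==1 selecting the reciprocal, so a division like V[0] = r/gamma_0 turns into one multiply per entry. A plain-double sketch of those semantics (names assumed, PRECISION macro replaced by double):

    #include <complex.h>

    /* r_alpha is precomputed once per right-hand side, then the site
       loop is a pure multiply, as in the z != x branch above */
    static void real_scale_sketch( double complex *z, const double complex *x,
                                   const double complex *alpha, int n, int opt,
                                   int size, int num_vect ) {
      double r_alpha[num_vect];
      for ( int j=0; j<num_vect; j++ )
        r_alpha[j] = opt ? 1.0/creal( alpha[n*num_vect+j] )
                         : creal( alpha[n*num_vect+j] );
      for ( int i=0; i<size; i++ )
        for ( int j=0; j<num_vect; j++ )
          z[i*num_vect+j] = r_alpha[j]*x[i*num_vect+j];
    }

    int main( void ) {
      double complex gamma[2] = { 2, 4 }, r[2] = { 2, 4 }, v0[2];
      real_scale_sketch( v0, r, gamma, 0, 1, 1, 2 );   /* v0 = r/Re(gamma) */
      return ( creal( v0[0] ) == 1 && creal( v0[1] ) == 1 ) ? 0 : 1;
    }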