diff --git a/build/.gitignore b/build/.gitignore index 5761abc..35d987b 100644 --- a/build/.gitignore +++ b/build/.gitignore @@ -1 +1,4 @@ +* *.o +!gsrc +!.gitignore \ No newline at end of file diff --git a/src/DDalphaAMG_interface.c b/src/DDalphaAMG_interface.c index 2bc3f82..da3b343 100644 --- a/src/DDalphaAMG_interface.c +++ b/src/DDalphaAMG_interface.c @@ -570,7 +570,7 @@ void DDalphaAMG_update_setup( int iterations, DDalphaAMG_status * mg_status ) { } } -static inline void vector_copy( vector_double vector_out, vector_double vector_in ) +static inline void vector_copy( vector_double *vector_out, vector_double *vector_in ) { THREADED(threading[0]->n_core) { int start = threading[omp_get_thread_num()]->start_index[0], @@ -591,7 +591,7 @@ static inline void solver( ) } } -static inline void correct_guess( vector_double guess, vector_double solution, vector_double solution2, +static inline void correct_guess( vector_double *guess, vector_double *solution, vector_double *solution2, double even_dshift, double odd_dshift ) { // guess = D^{-1}*rhs - i*dshift*D^{-2}*rhs @@ -666,8 +666,8 @@ static inline void DDalphaAMG_driver( double *vector1_out, double *vector1_in, d complex_double twisted_bc, tmp1, tmp2; double phase[4] = {_COMPLEX_double_ZERO, _COMPLEX_double_ZERO, _COMPLEX_double_ZERO, _COMPLEX_double_ZERO}, vmin=1, vmax=EPS_float, vtmp, nrhs, nrhs2; gmres_double_struct *p = g.mixed_precision==2?&(g.p_MP.dp):&(g.p); - vector_double vb=p->b, rhs = p->b; - vector_double vx=p->x, sol = p->x; + buffer_double vb=p->b.vector_buffer, vx=p->x.vector_buffer; + vector_double *rhs = &(p->b), *sol = &(p->x); DDalphaAMG_status tmp_status; double t0, t1; @@ -717,40 +717,41 @@ static inline void DDalphaAMG_driver( double *vector1_out, double *vector1_in, d for ( mu=0; mu<4; mu++ ) { for ( k=0; k<3; k++, j++ ) { #ifndef BASIS4 - rhs[j] = ((complex_double)vector1_in[i+2*(k+3*mu)] + I*(complex_double)vector1_in[i+2*(k+3*mu)+1]) * twisted_bc; - rhs[j+6] = ((complex_double)vector2_in[i+2*(k+3*mu)] + I*(complex_double)vector2_in[i+2*(k+3*mu)+1]) * twisted_bc; + rhs->vector_buffer[j] = ((complex_double)vector1_in[i+2*(k+3*mu)] + I*(complex_double)vector1_in[i+2*(k+3*mu)+1]) * twisted_bc; + rhs->vector_buffer[j+6] = ((complex_double)vector2_in[i+2*(k+3*mu)] + I*(complex_double)vector2_in[i+2*(k+3*mu)+1]) * twisted_bc; #else - rhs[j] = ((complex_double)vector1_in[i+2*(k+3*(3-mu))] + I*(complex_double)vector1_in[i+2*(k+3*(3-mu))+1]) * twisted_bc; - rhs[j+6] = ((complex_double)vector2_in[i+2*(k+3*(3-mu))] + I*(complex_double)vector2_in[i+2*(k+3*(3-mu))+1]) * twisted_bc; + rhs->vector_buffer[j] = ((complex_double)vector1_in[i+2*(k+3*(3-mu))] + I*(complex_double)vector1_in[i+2*(k+3*(3-mu))+1]) * twisted_bc; + rhs->vector_buffer[j+6] = ((complex_double)vector2_in[i+2*(k+3*(3-mu))] + I*(complex_double)vector2_in[i+2*(k+3*(3-mu))+1]) * twisted_bc; #endif if(p->initial_guess_zero == 0) { #ifndef BASIS4 - sol[j] = ((complex_double)vector1_out[i+2*(k+3*mu)] + I*(complex_double)vector1_out[i+2*(k+3*mu)+1]) * twisted_bc; - sol[j+6] = ((complex_double)vector2_out[i+2*(k+3*mu)] + I*(complex_double)vector2_out[i+2*(k+3*mu)+1]) * twisted_bc; + sol->vector_buffer[j] = ((complex_double)vector1_out[i+2*(k+3*mu)] + I*(complex_double)vector1_out[i+2*(k+3*mu)+1]) * twisted_bc; + sol->vector_buffer[j+6] = ((complex_double)vector2_out[i+2*(k+3*mu)] + I*(complex_double)vector2_out[i+2*(k+3*mu)+1]) * twisted_bc; #else - sol[j] = ((complex_double)vector1_out[i+2*(k+3*(3-mu))] + I*(complex_double)vector1_out[i+2*(k+3*(3-mu))+1]) * 
twisted_bc; - sol[j+6] = ((complex_double)vector2_out[i+2*(k+3*(3-mu))] + I*(complex_double)vector2_out[i+2*(k+3*(3-mu))+1]) * twisted_bc; + sol->vector_buffer[j] = ((complex_double)vector1_out[i+2*(k+3*(3-mu))] + I*(complex_double)vector1_out[i+2*(k+3*(3-mu))+1]) * twisted_bc; + sol->vector_buffer[j+6] = ((complex_double)vector2_out[i+2*(k+3*(3-mu))] + I*(complex_double)vector2_out[i+2*(k+3*(3-mu))+1]) * twisted_bc; #endif } #ifndef INIT_ONE_PREC if(g.mixed_precision==2) { - vtmp=cabs(rhs[j]); + vtmp=cabs(rhs->vector_buffer[j]); if(vtmp > vmax) vmax=vtmp; if( vtmp > EPS_double && vtmp < vmin ) vmin=vtmp; - vtmp=cabs(rhs[j+6]); + vtmp=cabs(rhs->vector_buffer[j+6]); if(vtmp > vmax) vmax=vtmp; if( vtmp > EPS_double && vtmp < vmin ) vmin=vtmp; } - } + #endif + } if(mu%2) j+=6; } @@ -759,30 +760,31 @@ static inline void DDalphaAMG_driver( double *vector1_out, double *vector1_in, d for ( mu=0; mu<4; mu++ ) for ( k=0; k<3; k++, j++ ) { #ifndef BASIS4 - rhs[j] = ((complex_double)vector1_in[i+2*(k+3*mu)] + I*(complex_double)vector1_in[i+2*(k+3*mu)+1]) * twisted_bc; + rhs->vector_buffer[j] = ((complex_double)vector1_in[i+2*(k+3*mu)] + I*(complex_double)vector1_in[i+2*(k+3*mu)+1]) * twisted_bc; #else - rhs[j] = ((complex_double)vector1_in[i+2*(k+3*(3-mu))] + I*(complex_double)vector1_in[i+2*(k+3*(3-mu))+1]) * twisted_bc; + rhs->vector_buffer[j] = ((complex_double)vector1_in[i+2*(k+3*(3-mu))] + I*(complex_double)vector1_in[i+2*(k+3*(3-mu))+1]) * twisted_bc; #endif if(p->initial_guess_zero == 0) { #ifndef BASIS4 - sol[j] = ((complex_double)vector1_out[i+2*(k+3*mu)] + I*(complex_double)vector1_out[i+2*(k+3*mu)+1]) * twisted_bc; + sol->vector_buffer[j] = ((complex_double)vector1_out[i+2*(k+3*mu)] + I*(complex_double)vector1_out[i+2*(k+3*mu)+1]) * twisted_bc; #else - sol[j] = ((complex_double)vector1_out[i+2*(k+3*(3-mu))] + I*(complex_double)vector1_out[i+2*(k+3*(3-mu))+1]) * twisted_bc; + sol->vector_buffer[j] = ((complex_double)vector1_out[i+2*(k+3*(3-mu))] + I*(complex_double)vector1_out[i+2*(k+3*(3-mu))+1]) * twisted_bc; #endif } #ifndef INIT_ONE_PREC if(g.mixed_precision==2) { - vtmp=cabs(rhs[j]); + vtmp=cabs(rhs->vector_buffer[j]); if(vtmp > vmax) vmax=vtmp; if( vtmp > EPS_double && vtmp < vmin ) vmin=vtmp; } - } + #endif + } } } } @@ -803,10 +805,10 @@ static inline void DDalphaAMG_driver( double *vector1_out, double *vector1_in, d g.mixed_precision=1; p = &(g.p); // storing pointer in x and b - vb = p->b; - vx = p->x; - p->b = g.p_MP.dp.b; - p->x = g.p_MP.dp.x; + vb = p->b.vector_buffer; + vx = p->x.vector_buffer; + p->b.vector_buffer = g.p_MP.dp.b.vector_buffer; + p->x.vector_buffer = g.p_MP.dp.x.vector_buffer; p->tol = g.p_MP.dp.tol; } else precision_changed = 0; #endif @@ -984,8 +986,8 @@ static inline void DDalphaAMG_driver( double *vector1_out, double *vector1_in, d if(g.n_flavours==2) { for ( mu=0; mu<4; mu++ ) { for ( k=0; k<3; k++, j++ ) { - tmp1 = sol[j] * twisted_bc; - tmp2 = sol[j+6] * twisted_bc; + tmp1 = sol->vector_buffer[j] * twisted_bc; + tmp2 = sol->vector_buffer[j+6] * twisted_bc; #ifndef BASIS4 vector1_out[i+2*(k+3*mu)] = creal(tmp1); vector1_out[i+2*(k+3*mu)+1] = cimag(tmp1); @@ -1005,7 +1007,7 @@ static inline void DDalphaAMG_driver( double *vector1_out, double *vector1_in, d #endif for ( mu=0; mu<4; mu++ ) for ( k=0; k<3; k++, j++ ) { - tmp1 = sol[j] * twisted_bc; + tmp1 = sol->vector_buffer[j] * twisted_bc; #ifndef BASIS4 vector1_out[i+2*(k+3*mu)] = creal(tmp1); vector1_out[i+2*(k+3*mu)+1] = cimag(tmp1); @@ -1023,8 +1025,8 @@ static inline void DDalphaAMG_driver( double 
*vector1_out, double *vector1_in, d if (precision_changed) { g.mixed_precision=2; // recovering pointer from x and b - p->b = vb; - p->x = vx; + p->b.vector_buffer = vb; + p->x.vector_buffer = vx; } #endif @@ -1049,9 +1051,14 @@ static inline void DDalphaAMG_ms_driver( double **vector1_out, double *vector1_i double phase[4] = {_COMPLEX_double_ZERO, _COMPLEX_double_ZERO, _COMPLEX_double_ZERO, _COMPLEX_double_ZERO}, vmin=1, vmax=EPS_float, vtmp, nrhs, nrhs2; gmres_double_struct *p = g.mixed_precision==2?&(g.p_MP.dp):&(g.p); - vector_double vb, rhs = p->b; - vector_double vx, sol = p->x; - vector_double source = NULL, solution = NULL, solution2 = NULL; + buffer_double vb, vx; + vector_double *rhs =&(p->b), *sol = &(p->x); + vector_double source, solution, solution2; + + vector_double_init( &source ); + vector_double_init( &solution ); + vector_double_init( &solution2 ); + DDalphaAMG_status tmp_status; double t0, t1; @@ -1102,29 +1109,30 @@ static inline void DDalphaAMG_ms_driver( double **vector1_out, double *vector1_i for ( mu=0; mu<4; mu++ ) { for ( k=0; k<3; k++, j++ ) { #ifndef BASIS4 - rhs[j] = ((complex_double)vector1_in[i+2*(k+3*mu)] + I*(complex_double)vector1_in[i+2*(k+3*mu)+1]) * twisted_bc; - rhs[j+6] = ((complex_double)vector2_in[i+2*(k+3*mu)] + I*(complex_double)vector2_in[i+2*(k+3*mu)+1]) * twisted_bc; + rhs->vector_buffer[j] = ((complex_double)vector1_in[i+2*(k+3*mu)] + I*(complex_double)vector1_in[i+2*(k+3*mu)+1]) * twisted_bc; + rhs->vector_buffer[j+6] = ((complex_double)vector2_in[i+2*(k+3*mu)] + I*(complex_double)vector2_in[i+2*(k+3*mu)+1]) * twisted_bc; #else - rhs[j] = ((complex_double)vector1_in[i+2*(k+3*(3-mu))] + I*(complex_double)vector1_in[i+2*(k+3*(3-mu))+1]) * twisted_bc; - rhs[j+6] = ((complex_double)vector2_in[i+2*(k+3*(3-mu))] + I*(complex_double)vector2_in[i+2*(k+3*(3-mu))+1]) * twisted_bc; + rhs->vector_buffer[j] = ((complex_double)vector1_in[i+2*(k+3*(3-mu))] + I*(complex_double)vector1_in[i+2*(k+3*(3-mu))+1]) * twisted_bc; + rhs->vector_buffer[j+6] = ((complex_double)vector2_in[i+2*(k+3*(3-mu))] + I*(complex_double)vector2_in[i+2*(k+3*(3-mu))+1]) * twisted_bc; #endif #ifndef INIT_ONE_PREC if(g.mixed_precision==2) { - vtmp=cabs(rhs[j]); + vtmp=cabs(rhs->vector_buffer[j]); if(vtmp > vmax) vmax=vtmp; if( vtmp > EPS_double && vtmp < vmin ) vmin=vtmp; - vtmp=cabs(rhs[j+6]); + vtmp=cabs(rhs->vector_buffer[j+6]); if(vtmp > vmax) vmax=vtmp; if( vtmp > EPS_double && vtmp < vmin ) vmin=vtmp; } - } + #endif + } if(mu%2) j+=6; } @@ -1133,21 +1141,22 @@ static inline void DDalphaAMG_ms_driver( double **vector1_out, double *vector1_i for ( mu=0; mu<4; mu++ ) for ( k=0; k<3; k++, j++ ) { #ifndef BASIS4 - rhs[j] = ((complex_double)vector1_in[i+2*(k+3*mu)] + I*(complex_double)vector1_in[i+2*(k+3*mu)+1]) * twisted_bc; + rhs->vector_buffer[j] = ((complex_double)vector1_in[i+2*(k+3*mu)] + I*(complex_double)vector1_in[i+2*(k+3*mu)+1]) * twisted_bc; #else - rhs[j] = ((complex_double)vector1_in[i+2*(k+3*(3-mu))] + I*(complex_double)vector1_in[i+2*(k+3*(3-mu))+1]) * twisted_bc; + rhs->vector_buffer[j] = ((complex_double)vector1_in[i+2*(k+3*(3-mu))] + I*(complex_double)vector1_in[i+2*(k+3*(3-mu))+1]) * twisted_bc; #endif #ifndef INIT_ONE_PREC if( g.mixed_precision == 2 ) { - vtmp = cabs(rhs[j]); + vtmp = cabs(rhs->vector_buffer[j]); if(vtmp > vmax) vmax = vtmp; if( vtmp > EPS_double && vtmp < vmin ) vmin = vtmp; } - } + #endif + } } } } @@ -1168,8 +1177,8 @@ static inline void DDalphaAMG_ms_driver( double **vector1_out, double *vector1_i g.mixed_precision=1; p = &(g.p); // storing 
pointer in x and b - vb = p->b; - vx = p->x; + vb = p->b.vector_buffer; + vx = p->x.vector_buffer; p->b = g.p_MP.dp.b; p->x = g.p_MP.dp.x; p->tol = g.p_MP.dp.tol; @@ -1181,10 +1190,10 @@ static inline void DDalphaAMG_ms_driver( double **vector1_out, double *vector1_i ASSERT( odd_shifts != NULL ); } if ( n_shifts > 1 ) { - MALLOC( source, complex_double, l.inner_vector_size ); - MALLOC( solution, complex_double, l.inner_vector_size ); + vector_double_alloc( &source, _INNER, 1, &l, no_threading); + vector_double_alloc( &solution, _INNER, 1, &l, no_threading); if( _TYPE == _SOLVE_SQ || _TYPE == _SOLVE_SQ_ODD || _TYPE == _SOLVE_SQ_EVEN ) - MALLOC( solution2, complex_double, l.inner_vector_size ); + vector_double_alloc( &solution2, _INNER, 1, &l, no_threading); } for ( n = 0; n < n_shifts; n++ ) { @@ -1220,10 +1229,10 @@ static inline void DDalphaAMG_ms_driver( double **vector1_out, double *vector1_i case _SOLVE : if ( n ) { - vector_copy( rhs, source ); + vector_copy( rhs, &source ); p->initial_guess_zero = 0; } else if ( n_shifts > 1 ) - vector_copy( source, rhs ); + vector_copy( &source, rhs ); solver( ); break; @@ -1231,7 +1240,7 @@ static inline void DDalphaAMG_ms_driver( double **vector1_out, double *vector1_i case _SOLVE_SQ : if ( n ) { - vector_copy( rhs, source ); + vector_copy( rhs, &source ); p->initial_guess_zero = 0; } else if ( n_shifts > 1 ) { THREADED(threading[0]->n_core) @@ -1243,18 +1252,18 @@ static inline void DDalphaAMG_ms_driver( double **vector1_out, double *vector1_i #endif // sol = (D_d^{-1})*g5*(D_u^{-1})*g5*rhs gamma5_double( rhs, rhs, &l, threading[omp_get_thread_num()] ); - vector_copy( source, rhs ); + vector_copy( &source, rhs ); } if( n ) - correct_guess( sol, solution, solution2, even_shifts[n]-even_shifts[n-1], odd_shifts[n]-odd_shifts[n-1]); + correct_guess( sol, &solution, &solution2, even_shifts[n]-even_shifts[n-1], odd_shifts[n]-odd_shifts[n-1]); // read NOTE RESIDUAL THREADED(threading[0]->n_core) nrhs = global_norm_double( rhs, p->v_start, p->v_end, &l, threading[omp_get_thread_num()] ); p->tol = tol[n]/2.; solver( ); if ( n < n_shifts-1 ) - vector_copy( solution, sol ); + vector_copy( &solution, sol ); THREADED(threading[0]->n_core) #ifdef HAVE_TM1p1 @@ -1272,7 +1281,7 @@ static inline void DDalphaAMG_ms_driver( double **vector1_out, double *vector1_i DDalphaAMG_change_mu_sign( &tmp_status ); if( n ) - vector_copy( sol, solution2 ); + vector_copy( sol, &solution2 ); // read NOTE RESIDUAL THREADED(threading[0]->n_core) @@ -1280,7 +1289,7 @@ static inline void DDalphaAMG_ms_driver( double **vector1_out, double *vector1_i p->tol = (tol[n]-g.norm_res)*nrhs/nrhs2/8.; solver( ); if ( n < n_shifts-1 ) - vector_copy( solution2, sol ); + vector_copy( &solution2, sol ); // DDalphaAMG_change_mu_sign( &tmp_status ); warning0("sign of mu changed during the inversion of squared operator\n"); @@ -1289,7 +1298,7 @@ static inline void DDalphaAMG_ms_driver( double **vector1_out, double *vector1_i case _SOLVE_SQ_ODD : if ( n ) { - vector_copy( rhs, source ); + vector_copy( rhs, &source ); p->initial_guess_zero = 0; } else if ( n_shifts > 1 ) { THREADED(threading[0]->n_core) @@ -1302,11 +1311,11 @@ static inline void DDalphaAMG_ms_driver( double **vector1_out, double *vector1_i // sol = (D_d^{-1})*g5*(D_u^{-1})*g5*rhs gamma5_set_even_to_zero_double(rhs, rhs, &l, threading[omp_get_thread_num()]); - vector_copy( source, rhs ); + vector_copy( &source, rhs ); } if( n ) - correct_guess( sol, solution, solution2, even_shifts[n]-even_shifts[n-1], odd_shifts[n]-odd_shifts[n-1]); + 
correct_guess( sol, &solution, &solution2, even_shifts[n]-even_shifts[n-1], odd_shifts[n]-odd_shifts[n-1]); // read NOTE RESIDUAL THREADED(threading[0]->n_core) @@ -1314,7 +1323,7 @@ static inline void DDalphaAMG_ms_driver( double **vector1_out, double *vector1_i p->tol = tol[n]/2.; solver( ); if ( n < n_shifts-1 ) - vector_copy( solution, sol ); + vector_copy( &solution, sol ); THREADED(threading[0]->n_core) #ifdef HAVE_TM1p1 @@ -1332,7 +1341,7 @@ static inline void DDalphaAMG_ms_driver( double **vector1_out, double *vector1_i DDalphaAMG_change_mu_sign( &tmp_status ); if( n ) - vector_copy( sol, solution2 ); + vector_copy( sol, &solution2 ); // read NOTE RESIDUAL THREADED(threading[0]->n_core) @@ -1340,7 +1349,7 @@ static inline void DDalphaAMG_ms_driver( double **vector1_out, double *vector1_i p->tol = (tol[n]-g.norm_res)*nrhs/nrhs2/8.; solver( ); if ( n < n_shifts-1 ) - vector_copy( solution2, sol ); + vector_copy( &solution2, sol ); // DDalphaAMG_change_mu_sign( &tmp_status ); warning0("sign of mu changed during the inversion of squared operator\n"); @@ -1349,7 +1358,7 @@ static inline void DDalphaAMG_ms_driver( double **vector1_out, double *vector1_i case _SOLVE_SQ_EVEN : if ( n ) { - vector_copy( rhs, source ); + vector_copy( rhs, &source ); p->initial_guess_zero = 0; } else if ( n_shifts > 1 ) { THREADED(threading[0]->n_core) @@ -1362,11 +1371,11 @@ static inline void DDalphaAMG_ms_driver( double **vector1_out, double *vector1_i // sol = (D_d^{-1})*g5*(D_u^{-1})*g5*rhs gamma5_set_odd_to_zero_double(rhs, rhs, &l, threading[omp_get_thread_num()]); - vector_copy( source, rhs ); + vector_copy( &source, rhs ); } if( n ) - correct_guess( sol, solution, solution2, even_shifts[n]-even_shifts[n-1], odd_shifts[n]-odd_shifts[n-1]); + correct_guess( sol, &solution, &solution2, even_shifts[n]-even_shifts[n-1], odd_shifts[n]-odd_shifts[n-1]); // read NOTE RESIDUAL THREADED(threading[0]->n_core) @@ -1374,7 +1383,7 @@ static inline void DDalphaAMG_ms_driver( double **vector1_out, double *vector1_i p->tol = tol[n]/2.; solver( ); if ( n < n_shifts-1 ) - vector_copy( solution, sol ); + vector_copy( &solution, sol ); THREADED(threading[0]->n_core) #ifdef HAVE_TM1p1 @@ -1392,14 +1401,14 @@ static inline void DDalphaAMG_ms_driver( double **vector1_out, double *vector1_i DDalphaAMG_change_mu_sign( &tmp_status ); if( n ) - vector_copy( sol, solution2 ); + vector_copy( sol, &solution2 ); // read NOTE RESIDUAL THREADED(threading[0]->n_core) nrhs2 = global_norm_double( rhs, p->v_start, p->v_end, &l, threading[omp_get_thread_num()] ); p->tol = (tol[n]-g.norm_res)*nrhs/nrhs2/8.; solver( ); if ( n < n_shifts-1 ) - vector_copy( solution2, sol ); + vector_copy( &solution2, sol ); // DDalphaAMG_change_mu_sign( &tmp_status ); warning0("sign of mu changed during the inversion of squared operator\n"); @@ -1449,8 +1458,8 @@ static inline void DDalphaAMG_ms_driver( double **vector1_out, double *vector1_i if(g.n_flavours==2) { for ( mu=0; mu<4; mu++ ) { for ( k=0; k<3; k++, j++ ) { - tmp1 = sol[j] * twisted_bc; - tmp2 = sol[j+6] * twisted_bc; + tmp1 = sol->vector_buffer[j] * twisted_bc; + tmp2 = sol->vector_buffer[j+6] * twisted_bc; #ifndef BASIS4 vector1_out[n][i+2*(k+3*mu)] = creal(tmp1); vector1_out[n][i+2*(k+3*mu)+1] = cimag(tmp1); @@ -1470,7 +1479,7 @@ static inline void DDalphaAMG_ms_driver( double **vector1_out, double *vector1_i #endif for ( mu=0; mu<4; mu++ ) for ( k=0; k<3; k++, j++ ) { - tmp1 = sol[j] * twisted_bc; + tmp1 = sol->vector_buffer[j] * twisted_bc; #ifndef BASIS4 vector1_out[n][i+2*(k+3*mu)] = 
creal(tmp1); vector1_out[n][i+2*(k+3*mu)+1] = cimag(tmp1); @@ -1488,10 +1497,10 @@ static inline void DDalphaAMG_ms_driver( double **vector1_out, double *vector1_i p->initial_guess_zero = 1; if ( n_shifts > 0 ) { - FREE( source, complex_double, l.inner_vector_size ); - FREE( solution, complex_double, l.inner_vector_size ); + vector_double_free( &source, &l, no_threading); + vector_double_free( &solution, &l, no_threading); if( _TYPE == _SOLVE_SQ || _TYPE == _SOLVE_SQ_ODD || _TYPE == _SOLVE_SQ_EVEN ) - FREE( solution2, complex_double, l.inner_vector_size ); + vector_double_free( &solution2, &l, no_threading); } @@ -1499,8 +1508,8 @@ if (precision_changed) { g.mixed_precision=2; // recovering pointer from x and b - p->b = vb; - p->x = vx; + p->b.vector_buffer = vb; + p->x.vector_buffer = vx; } #endif @@ -1533,8 +1542,8 @@ static inline void DDalphaAMG_proj_driver( double *vector_out, double *vector_in from=ltmp->next_level; to=ltmp; } - vector_float rhs = from->p_float.b; - vector_float sol = to->p_float.x; + vector_float *rhs = &(from->p_float.b); + vector_float *sol = &(to->p_float.x); double t0, t1; t0 = MPI_Wtime(); @@ -1559,7 +1568,7 @@ i = 2*j; for ( mu=0; mu<from->num_lattice_site_var; mu++, j++ ) - rhs[j] = ((complex_float)vector_in[i+2*mu] + I*(complex_float)vector_in[i+2*mu+1]); + rhs->vector_buffer[j] = ((complex_float)vector_in[i+2*mu] + I*(complex_float)vector_in[i+2*mu+1]); } switch(_TYPE) { @@ -1596,8 +1605,8 @@ i = 2*j; for ( mu=0; mu<to->num_lattice_site_var; mu++, j++ ) { - vector_out[i+2*mu] = (double) creal(sol[j]); - vector_out[i+2*mu+1] = (double) cimag(sol[j]); + vector_out[i+2*mu] = (double) creal(sol->vector_buffer[j]); + vector_out[i+2*mu+1] = (double) cimag(sol->vector_buffer[j]); } } @@ -1839,7 +1848,9 @@ void DDalphaAMG_define_vector_const( double *vector, double re, double im ) { if(vector!=NULL){ int start, end; compute_core_start_end( 0, l.inner_vector_size, &start, &end, &l, threading[omp_get_thread_num()]); - vector_double_define( (vector_double) vector, re+I*im, start, end, &l ); + vector_double vec; + vec.vector_buffer= (buffer_double) vector; + vector_double_define( &vec, re+I*im, start, end, &l ); } else { warning0("Vector NULL when calling DDalphaAMG_define_vector_const!"); @@ -1852,7 +1863,9 @@ void DDalphaAMG_define_vector_rand( double *vector ) { if(vector!=NULL){ int start, end; compute_core_start_end( 0, l.inner_vector_size, &start, &end, &l, threading[omp_get_thread_num()]); - vector_double_define_random( (vector_double) vector, start, end, &l ); + vector_double vec; + vec.vector_buffer= (buffer_double) vector; + vector_double_define_random( &vec, start, end, &l ); } else { warning0("Vector NULL when calling DDalphaAMG_define_vector_const!"); @@ -1865,7 +1878,9 @@ double DDalphaAMG_vector_norm( double *vector ) { double norm = 0; THREADED(threading[0]->n_core) if(vector!=NULL){ - norm = global_norm_double( (vector_double) vector, 0, l.inner_vector_size, &l, threading[omp_get_thread_num()] ); + vector_double vec; + vec.vector_buffer = (buffer_double) vector; + norm = global_norm_double( &vec, 0, l.inner_vector_size, &l, threading[omp_get_thread_num()] ); } else { warning0("Vector NULL when calling DDalphaAMG_define_vector_const!"); @@ -1880,7 +1895,9 @@ void DDalphaAMG_vector_saxpy( double *vector_out, double a, double
*x, double *y if(vector_out!=NULL && x!=NULL && y!=NULL){ int start, end; compute_core_start_end( 0, l.inner_vector_size, &start, &end, &l, threading[omp_get_thread_num()]); - vector_double_saxpy( (vector_double) vector_out, (vector_double) x, (vector_double) y, a, start, end, &l ); + vector_double vec_out, xx, yy; + vec_out.vector_buffer= (buffer_double) vector_out; xx.vector_buffer= (buffer_double) x; yy.vector_buffer= (buffer_double) y; + vector_double_saxpy( &vec_out, &xx, &yy, a, start, end, &l ); } else { warning0("Vector NULL when calling DDalphaAMG_define_vector_const!"); diff --git a/src/blas_vectorized.h b/src/blas_vectorized.h deleted file mode 100644 index 645c457..0000000 --- a/src/blas_vectorized.h +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder. - * - * This file is part of the DDalphaAMG solver library. - * - * The DDalphaAMG solver library is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * The DDalphaAMG solver library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * - * You should have received a copy of the GNU General Public License - * along with the DDalphaAMG solver library. If not, see http://www.gnu.org/licenses/. - * - */ - -#ifndef BLAS_VECTORIZED_H -#define BLAS_VECTORIZED_H - -// BLAS naming convention: LDA = leading dimension of A -#ifdef SSE -#include "sse_blas_vectorized.h" -#endif - -// C=A*B+C -static inline void cgemv(const int N, const OPERATOR_TYPE_float *A, int lda, const float *B, float *C) -{ -#ifdef SSE - sse_cgemv( N, A, lda, B, C ); -#endif -} - -// C=-A*B+C -static inline void cgenmv(const int N, const OPERATOR_TYPE_float *A, int lda, const float *B, float *C) -{ -#ifdef SSE - sse_cgenmv( N, A, lda, B, C ); -#endif -} - -// C=A*B+C with padded layout -static inline void cgemv_padded(const int N, const OPERATOR_TYPE_float *A, int lda, int padded, const float *B, float *C) -{ -#ifdef SSE - sse_cgemv_padded( N, A, lda, padded, B, C ); -#endif -} - -// C=-A*B+C with padded layout -static inline void cgenmv_padded(const int N, const OPERATOR_TYPE_float *A, int lda, int padded, const float *B, float *C) -{ -#ifdef SSE - sse_cgenmv_padded( N, A, lda, padded, B, C ); -#endif -} - - -static inline void cgem_inverse(const int N, OPERATOR_TYPE_float *A_inverse, OPERATOR_TYPE_float *A, int lda) -{ -#ifdef SSE - sse_cgem_inverse( N, A_inverse, A, lda ); -#endif -} - -#endif // BLAS_VECTORIZED_H diff --git a/src/clifford.h b/src/clifford.h index 6521566..9307579 100644 --- a/src/clifford.h +++ b/src/clifford.h @@ -99,64 +99,6 @@ #define GAMMA_X_SPIN2_VAL I #define GAMMA_X_SPIN3_CO 1 #define GAMMA_X_SPIN3_VAL -I -#ifdef SSE - #define GAMMA_T_SPIN0_RE_SIGN -1 - #define GAMMA_T_SPIN1_RE_SIGN -1 - #define GAMMA_T_SPIN2_RE_SIGN -1 - #define GAMMA_T_SPIN3_RE_SIGN -1 - #define GAMMA_T_SPIN0_IM_SIGN -1 - #define GAMMA_T_SPIN1_IM_SIGN -1 - #define GAMMA_T_SPIN2_IM_SIGN -1 - #define GAMMA_T_SPIN3_IM_SIGN -1 - #define GAMMA_T_SPIN0_OFFSET 0 - #define GAMMA_T_SPIN1_OFFSET 0 - #define GAMMA_T_SPIN2_OFFSET 0 - #define GAMMA_T_SPIN3_OFFSET 0 - - #define GAMMA_Z_SPIN0_RE_SIGN +1 - #define 
GAMMA_Z_SPIN1_RE_SIGN +1 - #define GAMMA_Z_SPIN2_RE_SIGN -1 - #define GAMMA_Z_SPIN3_RE_SIGN -1 - #define GAMMA_Z_SPIN0_IM_SIGN -1 - #define GAMMA_Z_SPIN1_IM_SIGN -1 - #define GAMMA_Z_SPIN2_IM_SIGN +1 - #define GAMMA_Z_SPIN3_IM_SIGN +1 - #define GAMMA_Z_SPIN0_OFFSET 1 - #define GAMMA_Z_SPIN1_OFFSET 1 - #define GAMMA_Z_SPIN2_OFFSET 1 - #define GAMMA_Z_SPIN3_OFFSET 1 - - #define GAMMA_Y_SPIN0_RE_SIGN -1 - #define GAMMA_Y_SPIN1_RE_SIGN +1 - #define GAMMA_Y_SPIN2_RE_SIGN +1 - #define GAMMA_Y_SPIN3_RE_SIGN -1 - #define GAMMA_Y_SPIN0_IM_SIGN -1 - #define GAMMA_Y_SPIN1_IM_SIGN +1 - #define GAMMA_Y_SPIN2_IM_SIGN +1 - #define GAMMA_Y_SPIN3_IM_SIGN -1 - #define GAMMA_Y_SPIN0_OFFSET 0 - #define GAMMA_Y_SPIN1_OFFSET 0 - #define GAMMA_Y_SPIN2_OFFSET 0 - #define GAMMA_Y_SPIN3_OFFSET 0 - - #define GAMMA_X_SPIN0_RE_SIGN +1 - #define GAMMA_X_SPIN1_RE_SIGN -1 - #define GAMMA_X_SPIN2_RE_SIGN -1 - #define GAMMA_X_SPIN3_RE_SIGN +1 - #define GAMMA_X_SPIN0_IM_SIGN -1 - #define GAMMA_X_SPIN1_IM_SIGN +1 - #define GAMMA_X_SPIN2_IM_SIGN +1 - #define GAMMA_X_SPIN3_IM_SIGN -1 - #define GAMMA_X_SPIN0_OFFSET 1 - #define GAMMA_X_SPIN1_OFFSET 1 - #define GAMMA_X_SPIN2_OFFSET 1 - #define GAMMA_X_SPIN3_OFFSET 1 - - #define GAMMA_T_SHUFFLE(A) A - #define GAMMA_Z_SHUFFLE(A) _mm_shuffle_pd(A,A,_MM_SHUFFLE2(0,1)) - #define GAMMA_Y_SHUFFLE(A) A - #define GAMMA_X_SHUFFLE(A) _mm_shuffle_pd(A,A,_MM_SHUFFLE2(0,1)) -#endif /* ------------------------------------------------- */ #else @@ -224,64 +166,6 @@ #define GAMMA_X_SPIN2_VAL I #define GAMMA_X_SPIN3_CO 0 #define GAMMA_X_SPIN3_VAL I -#ifdef SSE - #define GAMMA_T_SPIN0_RE_SIGN -1 - #define GAMMA_T_SPIN1_RE_SIGN -1 - #define GAMMA_T_SPIN2_RE_SIGN -1 - #define GAMMA_T_SPIN3_RE_SIGN -1 - #define GAMMA_T_SPIN0_IM_SIGN -1 - #define GAMMA_T_SPIN1_IM_SIGN -1 - #define GAMMA_T_SPIN2_IM_SIGN -1 - #define GAMMA_T_SPIN3_IM_SIGN -1 - #define GAMMA_T_SPIN0_OFFSET 0 - #define GAMMA_T_SPIN1_OFFSET 0 - #define GAMMA_T_SPIN2_OFFSET 0 - #define GAMMA_T_SPIN3_OFFSET 0 - - #define GAMMA_Z_SPIN0_RE_SIGN +1 - #define GAMMA_Z_SPIN1_RE_SIGN -1 - #define GAMMA_Z_SPIN2_RE_SIGN -1 - #define GAMMA_Z_SPIN3_RE_SIGN +1 - #define GAMMA_Z_SPIN0_IM_SIGN -1 - #define GAMMA_Z_SPIN1_IM_SIGN +1 - #define GAMMA_Z_SPIN2_IM_SIGN +1 - #define GAMMA_Z_SPIN3_IM_SIGN -1 - #define GAMMA_Z_SPIN0_OFFSET 1 - #define GAMMA_Z_SPIN1_OFFSET 1 - #define GAMMA_Z_SPIN2_OFFSET 1 - #define GAMMA_Z_SPIN3_OFFSET 1 - - #define GAMMA_Y_SPIN0_RE_SIGN +1 - #define GAMMA_Y_SPIN1_RE_SIGN -1 - #define GAMMA_Y_SPIN2_RE_SIGN -1 - #define GAMMA_Y_SPIN3_RE_SIGN +1 - #define GAMMA_Y_SPIN0_IM_SIGN +1 - #define GAMMA_Y_SPIN1_IM_SIGN -1 - #define GAMMA_Y_SPIN2_IM_SIGN -1 - #define GAMMA_Y_SPIN3_IM_SIGN +1 - #define GAMMA_Y_SPIN0_OFFSET 0 - #define GAMMA_Y_SPIN1_OFFSET 0 - #define GAMMA_Y_SPIN2_OFFSET 0 - #define GAMMA_Y_SPIN3_OFFSET 0 - - #define GAMMA_X_SPIN0_RE_SIGN +1 - #define GAMMA_X_SPIN1_RE_SIGN +1 - #define GAMMA_X_SPIN2_RE_SIGN -1 - #define GAMMA_X_SPIN3_RE_SIGN -1 - #define GAMMA_X_SPIN0_IM_SIGN -1 - #define GAMMA_X_SPIN1_IM_SIGN -1 - #define GAMMA_X_SPIN2_IM_SIGN +1 - #define GAMMA_X_SPIN3_IM_SIGN +1 - #define GAMMA_X_SPIN0_OFFSET 1 - #define GAMMA_X_SPIN1_OFFSET 1 - #define GAMMA_X_SPIN2_OFFSET 1 - #define GAMMA_X_SPIN3_OFFSET 1 - - #define GAMMA_T_SHUFFLE(A) A - #define GAMMA_Z_SHUFFLE(A) _mm_shuffle_pd(A,A,_MM_SHUFFLE2(0,1)) - #define GAMMA_Y_SHUFFLE(A) A - #define GAMMA_X_SHUFFLE(A) _mm_shuffle_pd(A,A,_MM_SHUFFLE2(0,1)) -#endif /* ------------------------------------------------- */ #else #ifdef BASIS2 @@ -346,64 +230,6 @@ #define 
GAMMA_X_SPIN2_VAL -I #define GAMMA_X_SPIN3_CO 0 #define GAMMA_X_SPIN3_VAL -I -#ifdef SSE - #define GAMMA_T_SPIN0_RE_SIGN +1 - #define GAMMA_T_SPIN1_RE_SIGN +1 - #define GAMMA_T_SPIN2_RE_SIGN +1 - #define GAMMA_T_SPIN3_RE_SIGN +1 - #define GAMMA_T_SPIN0_IM_SIGN +1 - #define GAMMA_T_SPIN1_IM_SIGN +1 - #define GAMMA_T_SPIN2_IM_SIGN +1 - #define GAMMA_T_SPIN3_IM_SIGN +1 - #define GAMMA_T_SPIN0_OFFSET 0 - #define GAMMA_T_SPIN1_OFFSET 0 - #define GAMMA_T_SPIN2_OFFSET 0 - #define GAMMA_T_SPIN3_OFFSET 0 - - #define GAMMA_Z_SPIN0_RE_SIGN -1 - #define GAMMA_Z_SPIN1_RE_SIGN +1 - #define GAMMA_Z_SPIN2_RE_SIGN +1 - #define GAMMA_Z_SPIN3_RE_SIGN -1 - #define GAMMA_Z_SPIN0_IM_SIGN +1 - #define GAMMA_Z_SPIN1_IM_SIGN -1 - #define GAMMA_Z_SPIN2_IM_SIGN -1 - #define GAMMA_Z_SPIN3_IM_SIGN +1 - #define GAMMA_Z_SPIN0_OFFSET 1 - #define GAMMA_Z_SPIN1_OFFSET 1 - #define GAMMA_Z_SPIN2_OFFSET 1 - #define GAMMA_Z_SPIN3_OFFSET 1 - - #define GAMMA_Y_SPIN0_RE_SIGN -1 - #define GAMMA_Y_SPIN1_RE_SIGN +1 - #define GAMMA_Y_SPIN2_RE_SIGN +1 - #define GAMMA_Y_SPIN3_RE_SIGN -1 - #define GAMMA_Y_SPIN0_IM_SIGN -1 - #define GAMMA_Y_SPIN1_IM_SIGN +1 - #define GAMMA_Y_SPIN2_IM_SIGN +1 - #define GAMMA_Y_SPIN3_IM_SIGN -1 - #define GAMMA_Y_SPIN0_OFFSET 0 - #define GAMMA_Y_SPIN1_OFFSET 0 - #define GAMMA_Y_SPIN2_OFFSET 0 - #define GAMMA_Y_SPIN3_OFFSET 0 - - #define GAMMA_X_SPIN0_RE_SIGN -1 - #define GAMMA_X_SPIN1_RE_SIGN -1 - #define GAMMA_X_SPIN2_RE_SIGN +1 - #define GAMMA_X_SPIN3_RE_SIGN +1 - #define GAMMA_X_SPIN0_IM_SIGN +1 - #define GAMMA_X_SPIN1_IM_SIGN +1 - #define GAMMA_X_SPIN2_IM_SIGN -1 - #define GAMMA_X_SPIN3_IM_SIGN -1 - #define GAMMA_X_SPIN0_OFFSET 1 - #define GAMMA_X_SPIN1_OFFSET 1 - #define GAMMA_X_SPIN2_OFFSET 1 - #define GAMMA_X_SPIN3_OFFSET 1 - - #define GAMMA_T_SHUFFLE(A) A - #define GAMMA_Z_SHUFFLE(A) _mm_shuffle_pd(A,A,_MM_SHUFFLE2(0,1)) - #define GAMMA_Y_SHUFFLE(A) A - #define GAMMA_X_SHUFFLE(A) _mm_shuffle_pd(A,A,_MM_SHUFFLE2(0,1)) -#endif #else #ifdef BASIS3 // Basis used in the QOPQDP Code (by James Osborn/USQCD) @@ -467,64 +293,6 @@ #define GAMMA_X_SPIN2_VAL -I #define GAMMA_X_SPIN3_CO 1 #define GAMMA_X_SPIN3_VAL I -#ifdef SSE - #define GAMMA_T_SPIN0_RE_SIGN +1 - #define GAMMA_T_SPIN1_RE_SIGN +1 - #define GAMMA_T_SPIN2_RE_SIGN +1 - #define GAMMA_T_SPIN3_RE_SIGN +1 - #define GAMMA_T_SPIN0_IM_SIGN +1 - #define GAMMA_T_SPIN1_IM_SIGN +1 - #define GAMMA_T_SPIN2_IM_SIGN +1 - #define GAMMA_T_SPIN3_IM_SIGN +1 - #define GAMMA_T_SPIN0_OFFSET 0 - #define GAMMA_T_SPIN1_OFFSET 0 - #define GAMMA_T_SPIN2_OFFSET 0 - #define GAMMA_T_SPIN3_OFFSET 0 - - #define GAMMA_Z_SPIN0_RE_SIGN -1 - #define GAMMA_Z_SPIN1_RE_SIGN -1 - #define GAMMA_Z_SPIN2_RE_SIGN +1 - #define GAMMA_Z_SPIN3_RE_SIGN +1 - #define GAMMA_Z_SPIN0_IM_SIGN +1 - #define GAMMA_Z_SPIN1_IM_SIGN +1 - #define GAMMA_Z_SPIN2_IM_SIGN -1 - #define GAMMA_Z_SPIN3_IM_SIGN -1 - #define GAMMA_Z_SPIN0_OFFSET 1 - #define GAMMA_Z_SPIN1_OFFSET 1 - #define GAMMA_Z_SPIN2_OFFSET 1 - #define GAMMA_Z_SPIN3_OFFSET 1 - - #define GAMMA_Y_SPIN0_RE_SIGN -1 - #define GAMMA_Y_SPIN1_RE_SIGN +1 - #define GAMMA_Y_SPIN2_RE_SIGN +1 - #define GAMMA_Y_SPIN3_RE_SIGN -1 - #define GAMMA_Y_SPIN0_IM_SIGN -1 - #define GAMMA_Y_SPIN1_IM_SIGN +1 - #define GAMMA_Y_SPIN2_IM_SIGN +1 - #define GAMMA_Y_SPIN3_IM_SIGN -1 - #define GAMMA_Y_SPIN0_OFFSET 0 - #define GAMMA_Y_SPIN1_OFFSET 0 - #define GAMMA_Y_SPIN2_OFFSET 0 - #define GAMMA_Y_SPIN3_OFFSET 0 - - #define GAMMA_X_SPIN0_RE_SIGN -1 - #define GAMMA_X_SPIN1_RE_SIGN +1 - #define GAMMA_X_SPIN2_RE_SIGN +1 - #define GAMMA_X_SPIN3_RE_SIGN -1 - #define 
GAMMA_X_SPIN0_IM_SIGN +1 - #define GAMMA_X_SPIN1_IM_SIGN -1 - #define GAMMA_X_SPIN2_IM_SIGN -1 - #define GAMMA_X_SPIN3_IM_SIGN +1 - #define GAMMA_X_SPIN0_OFFSET 1 - #define GAMMA_X_SPIN1_OFFSET 1 - #define GAMMA_X_SPIN2_OFFSET 1 - #define GAMMA_X_SPIN3_OFFSET 1 - - #define GAMMA_T_SHUFFLE(A) A - #define GAMMA_Z_SHUFFLE(A) _mm_shuffle_pd(A,A,_MM_SHUFFLE2(0,1)) - #define GAMMA_Y_SHUFFLE(A) A - #define GAMMA_X_SHUFFLE(A) _mm_shuffle_pd(A,A,_MM_SHUFFLE2(0,1)) -#endif #else #ifdef BASIS4 // tmLQCD BASIS with an addition change of sign in gamma5 @@ -589,100 +357,10 @@ #define GAMMA_X_SPIN2_VAL -I #define GAMMA_X_SPIN3_CO 0 #define GAMMA_X_SPIN3_VAL -I -#ifdef SSE - #define GAMMA_T_SPIN0_RE_SIGN -1 - #define GAMMA_T_SPIN1_RE_SIGN -1 - #define GAMMA_T_SPIN2_RE_SIGN -1 - #define GAMMA_T_SPIN3_RE_SIGN -1 - #define GAMMA_T_SPIN0_IM_SIGN -1 - #define GAMMA_T_SPIN1_IM_SIGN -1 - #define GAMMA_T_SPIN2_IM_SIGN -1 - #define GAMMA_T_SPIN3_IM_SIGN -1 - #define GAMMA_T_SPIN0_OFFSET 0 - #define GAMMA_T_SPIN1_OFFSET 0 - #define GAMMA_T_SPIN2_OFFSET 0 - #define GAMMA_T_SPIN3_OFFSET 0 - - #define GAMMA_Z_SPIN0_RE_SIGN +1 - #define GAMMA_Z_SPIN1_RE_SIGN -1 - #define GAMMA_Z_SPIN2_RE_SIGN -1 - #define GAMMA_Z_SPIN3_RE_SIGN +1 - #define GAMMA_Z_SPIN0_IM_SIGN -1 - #define GAMMA_Z_SPIN1_IM_SIGN +1 - #define GAMMA_Z_SPIN2_IM_SIGN +1 - #define GAMMA_Z_SPIN3_IM_SIGN -1 - #define GAMMA_Z_SPIN0_OFFSET 1 - #define GAMMA_Z_SPIN1_OFFSET 1 - #define GAMMA_Z_SPIN2_OFFSET 1 - #define GAMMA_Z_SPIN3_OFFSET 1 - - #define GAMMA_Y_SPIN0_RE_SIGN -1 - #define GAMMA_Y_SPIN1_RE_SIGN +1 - #define GAMMA_Y_SPIN2_RE_SIGN +1 - #define GAMMA_Y_SPIN3_RE_SIGN -1 - #define GAMMA_Y_SPIN0_IM_SIGN -1 - #define GAMMA_Y_SPIN1_IM_SIGN +1 - #define GAMMA_Y_SPIN2_IM_SIGN +1 - #define GAMMA_Y_SPIN3_IM_SIGN -1 - #define GAMMA_Y_SPIN0_OFFSET 0 - #define GAMMA_Y_SPIN1_OFFSET 0 - #define GAMMA_Y_SPIN2_OFFSET 0 - #define GAMMA_Y_SPIN3_OFFSET 0 - - #define GAMMA_X_SPIN0_RE_SIGN -1 - #define GAMMA_X_SPIN1_RE_SIGN -1 - #define GAMMA_X_SPIN2_RE_SIGN +1 - #define GAMMA_X_SPIN3_RE_SIGN +1 - #define GAMMA_X_SPIN0_IM_SIGN +1 - #define GAMMA_X_SPIN1_IM_SIGN +1 - #define GAMMA_X_SPIN2_IM_SIGN -1 - #define GAMMA_X_SPIN3_IM_SIGN -1 - #define GAMMA_X_SPIN0_OFFSET 1 - #define GAMMA_X_SPIN1_OFFSET 1 - #define GAMMA_X_SPIN2_OFFSET 1 - #define GAMMA_X_SPIN3_OFFSET 1 - - #define GAMMA_T_SHUFFLE(A) A - #define GAMMA_Z_SHUFFLE(A) _mm_shuffle_pd(A,A,_MM_SHUFFLE2(0,1)) - #define GAMMA_Y_SHUFFLE(A) A - #define GAMMA_X_SHUFFLE(A) _mm_shuffle_pd(A,A,_MM_SHUFFLE2(0,1)) -#endif /* ------------------------------------------------- */ + /* ------------------------------------------------- */ #endif #endif #endif #endif #endif - -#ifdef SSE -static const int gamma_co[4][4] = { - {GAMMA_T_SPIN0_CO, GAMMA_T_SPIN1_CO, GAMMA_T_SPIN2_CO, GAMMA_T_SPIN3_CO}, - {GAMMA_Z_SPIN0_CO, GAMMA_Z_SPIN1_CO, GAMMA_Z_SPIN2_CO, GAMMA_Z_SPIN3_CO}, - {GAMMA_Y_SPIN0_CO, GAMMA_Y_SPIN1_CO, GAMMA_Y_SPIN2_CO, GAMMA_Y_SPIN3_CO}, - {GAMMA_X_SPIN0_CO, GAMMA_X_SPIN1_CO, GAMMA_X_SPIN2_CO, GAMMA_X_SPIN3_CO}}; - -static const double complex gamma_val[4][4] = { - {GAMMA_T_SPIN0_VAL, GAMMA_T_SPIN1_VAL, GAMMA_T_SPIN2_VAL, GAMMA_T_SPIN3_VAL}, - {GAMMA_Z_SPIN0_VAL, GAMMA_Z_SPIN1_VAL, GAMMA_Z_SPIN2_VAL, GAMMA_Z_SPIN3_VAL}, - {GAMMA_Y_SPIN0_VAL, GAMMA_Y_SPIN1_VAL, GAMMA_Y_SPIN2_VAL, GAMMA_Y_SPIN3_VAL}, - {GAMMA_X_SPIN0_VAL, GAMMA_X_SPIN1_VAL, GAMMA_X_SPIN2_VAL, GAMMA_X_SPIN3_VAL}}; - -static const int gamma_offset[4][4] = { - {GAMMA_T_SPIN0_OFFSET,GAMMA_T_SPIN1_OFFSET,GAMMA_T_SPIN2_OFFSET,GAMMA_T_SPIN3_OFFSET}, - 
{GAMMA_Z_SPIN0_OFFSET,GAMMA_Z_SPIN1_OFFSET,GAMMA_Z_SPIN2_OFFSET,GAMMA_Z_SPIN3_OFFSET}, - {GAMMA_Y_SPIN0_OFFSET,GAMMA_Y_SPIN1_OFFSET,GAMMA_Y_SPIN2_OFFSET,GAMMA_Y_SPIN3_OFFSET}, - {GAMMA_X_SPIN0_OFFSET,GAMMA_X_SPIN1_OFFSET,GAMMA_X_SPIN2_OFFSET,GAMMA_X_SPIN3_OFFSET}}; - -static const int gamma_re_sign[4][4] = { - {GAMMA_T_SPIN0_RE_SIGN,GAMMA_T_SPIN1_RE_SIGN,GAMMA_T_SPIN2_RE_SIGN,GAMMA_T_SPIN3_RE_SIGN}, - {GAMMA_Z_SPIN0_RE_SIGN,GAMMA_Z_SPIN1_RE_SIGN,GAMMA_Z_SPIN2_RE_SIGN,GAMMA_Z_SPIN3_RE_SIGN}, - {GAMMA_Y_SPIN0_RE_SIGN,GAMMA_Y_SPIN1_RE_SIGN,GAMMA_Y_SPIN2_RE_SIGN,GAMMA_Y_SPIN3_RE_SIGN}, - {GAMMA_X_SPIN0_RE_SIGN,GAMMA_X_SPIN1_RE_SIGN,GAMMA_X_SPIN2_RE_SIGN,GAMMA_X_SPIN3_RE_SIGN}}; - -static const int gamma_im_sign[4][4] = { - {GAMMA_T_SPIN0_IM_SIGN,GAMMA_T_SPIN1_IM_SIGN,GAMMA_T_SPIN2_IM_SIGN,GAMMA_T_SPIN3_IM_SIGN}, - {GAMMA_Z_SPIN0_IM_SIGN,GAMMA_Z_SPIN1_IM_SIGN,GAMMA_Z_SPIN2_IM_SIGN,GAMMA_Z_SPIN3_IM_SIGN}, - {GAMMA_Y_SPIN0_IM_SIGN,GAMMA_Y_SPIN1_IM_SIGN,GAMMA_Y_SPIN2_IM_SIGN,GAMMA_Y_SPIN3_IM_SIGN}, - {GAMMA_X_SPIN0_IM_SIGN,GAMMA_X_SPIN1_IM_SIGN,GAMMA_X_SPIN2_IM_SIGN,GAMMA_X_SPIN3_IM_SIGN}}; -#endif - #endif diff --git a/src/coarse_oddeven_generic.c b/src/coarse_oddeven_generic.c index b0baa6a..a5c4e5a 100644 --- a/src/coarse_oddeven_generic.c +++ b/src/coarse_oddeven_generic.c @@ -254,62 +254,59 @@ void coarse_selfcoupling_LU_doublet_decomposition_PRECISION( config_PRECISION ou #endif -void coarse_perform_fwd_bwd_subs_PRECISION( vector_PRECISION x, vector_PRECISION b, config_PRECISION A, level_struct *l ) { +void coarse_perform_fwd_bwd_subs_PRECISION( vector_PRECISION *x, vector_PRECISION *b, config_PRECISION A, level_struct *l ) { register int i, j, n2 = l->num_lattice_site_var; // solve x = U^(-1) L^(-1) b // forward substitution with L for ( i=0; i<n2; i++ ) { - x[i] = b[i]; + x->vector_buffer[i] = b->vector_buffer[i]; for ( j=0; j<i; j++ ) { - x[i] = x[i] - A[i*n2+j]*x[j]; + x->vector_buffer[i] = x->vector_buffer[i] - A[i*n2+j]*x->vector_buffer[j]; } } // backward substitution with U for ( i=n2-1; i>=0; i-- ) { for ( j=i+1; j<n2; j++ ) { - x[i] = x[i] - A[i*n2+j]*x[j]; + x->vector_buffer[i] = x->vector_buffer[i] - A[i*n2+j]*x->vector_buffer[j]; } - x[i] = x[i]/A[i*(n2+1)]; + x->vector_buffer[i] = x->vector_buffer[i]/A[i*(n2+1)]; } } -void coarse_LU_multiply_PRECISION( vector_PRECISION y, vector_PRECISION x, config_PRECISION A, level_struct *l ) { +void coarse_LU_multiply_PRECISION( vector_PRECISION *y, vector_PRECISION *x, config_PRECISION A, level_struct *l ) { register int i, j, n2 = l->num_lattice_site_var; // y = Ax // multiplication with U for ( i=0; i<n2; i++ ) { - y[i] = A[i*(n2+1)]*x[i]; + y->vector_buffer[i] = A[i*(n2+1)]*x->vector_buffer[i]; for ( j=i+1; j<n2; j++ ) - y[i] += A[i*n2+j]*x[j]; + y->vector_buffer[i] += A[i*n2+j]*x->vector_buffer[j]; } // multiplication with L for ( i=n2-1; i>0; i-- ) for ( j=0; j<i; j++ ) - y[i] += A[i*n2+j]*y[j]; + y->vector_buffer[i] += A[i*n2+j]*y->vector_buffer[j]; } -void coarse_diag_ee_PRECISION( vector_PRECISION y, vector_PRECISION x, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { +void coarse_diag_ee_PRECISION( vector_PRECISION *y, vector_PRECISION *x, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { int start, end; compute_core_start_end_custom( 0, op->num_even_sites, &start, &end, l, threading, 1 ); // even sites -#ifndef OPTIMIZED_COARSE_SELF_COUPLING_PRECISION coarse_self_couplings_PRECISION( y, x, op, start, end, l ); -#else - coarse_self_couplings_PRECISION_vectorized( y, x, op, start, end, l ); -#endif } -void coarse_diag_oo_PRECISION( vector_PRECISION y, vector_PRECISION x, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { +void coarse_diag_oo_PRECISION( vector_PRECISION *y, vector_PRECISION *x,
operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { int start, end; -#ifndef OPTIMIZED_COARSE_SELF_COUPLING_PRECISION + + vector_PRECISION x_pt, y_pt; int num_site_var=l->num_lattice_site_var, oo_inv_size = SQUARE(num_site_var); #ifdef HAVE_TM1p1 @@ -317,72 +314,53 @@ void coarse_diag_oo_PRECISION( vector_PRECISION y, vector_PRECISION x, operator_ #else config_PRECISION sc = op->clover_oo_inv; #endif - + compute_core_start_end_custom( 0, op->num_odd_sites, &start, &end, l, threading, 1 ); - x += num_site_var*(op->num_even_sites+start); - y += num_site_var*(op->num_even_sites+start); + x_pt.vector_buffer = x->vector_buffer + num_site_var*(op->num_even_sites+start); + y_pt.vector_buffer = y->vector_buffer + num_site_var*(op->num_even_sites+start); sc += oo_inv_size*start; for ( int i=start; inum_even_sites, l->num_inner_lattice_sites, &start, &end, l, threading, 1 ); - coarse_self_couplings_PRECISION_vectorized( y, x, op, start, end, l ); -#endif } -void coarse_diag_PRECISION( vector_PRECISION y, vector_PRECISION x, operator_PRECISION_struct *op, level_struct *l ) { +void coarse_diag_PRECISION( vector_PRECISION *y, vector_PRECISION *x, operator_PRECISION_struct *op, level_struct *l ) { coarse_diag_ee_PRECISION( y, x, op, l, no_threading ); coarse_diag_oo_PRECISION( y, x, op, l, no_threading ); } -void coarse_diag_oo_inv_PRECISION( vector_PRECISION y, vector_PRECISION x, operator_PRECISION_struct *op, +void coarse_diag_oo_inv_PRECISION( vector_PRECISION *y, vector_PRECISION *x, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { int start, end; + vector_PRECISION x_pt, y_pt; compute_core_start_end_custom( 0, op->num_odd_sites, &start, &end, l, threading, 1 ); // odd sites int num_site_var = l->num_lattice_site_var, oo_inv_size = SQUARE(num_site_var); -#ifndef OPTIMIZED_COARSE_SELF_COUPLING_PRECISION #ifdef HAVE_TM1p1 config_PRECISION sc = (g.n_flavours==2) ? op->clover_doublet_oo_inv:op->clover_oo_inv; #else config_PRECISION sc = op->clover_oo_inv; -#endif -#else - int lda = SIMD_LENGTH_PRECISION*((num_site_var+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - oo_inv_size = 2*num_site_var*lda; -#ifdef HAVE_TM1p1 - OPERATOR_TYPE_PRECISION *sc = (g.n_flavours==2) ? 
op->clover_doublet_oo_inv_vectorized:op->clover_oo_inv_vectorized; -#else - OPERATOR_TYPE_PRECISION *sc = op->clover_oo_inv_vectorized; -#endif #endif - x += num_site_var*(op->num_even_sites+start); - y += num_site_var*(op->num_even_sites+start); + x_pt.vector_buffer = x->vector_buffer + num_site_var*(op->num_even_sites+start); + y_pt.vector_buffer = y->vector_buffer + num_site_var*(op->num_even_sites+start); sc += oo_inv_size*start; for ( int i=start; inum_odd_sites, &start, &end, l, threading, 1); -#ifndef OPTIMIZED_COARSE_SELF_COUPLING_PRECISION - int size = SQUARE(2*nv); for( int i=start; iclover_oo_inv+i*size, op, op->num_even_sites+i, l ); @@ -408,24 +384,6 @@ void coarse_oddeven_PRECISION_set_self_couplings( level_struct *l, struct Thread coarse_selfcoupling_LU_doublet_decomposition_PRECISION( op->clover_doublet_oo_inv+i*size_doublet, op, op->num_even_sites+i, l ); #endif - -#else - - int column_offset = SIMD_LENGTH_PRECISION*((2*nv+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - int size_v = 2*2*nv*column_offset; - for( int i=start; iclover_oo_inv_vectorized + i*size_v, - op->clover_vectorized + (op->num_even_sites+i)*size_v, column_offset ); - -#ifdef HAVE_TM1p1 - int column_doublet_offset = SIMD_LENGTH_PRECISION*((4*nv+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - int size_doublet_v = 2*4*nv*column_doublet_offset; - for( int i=start; iclover_doublet_oo_inv_vectorized + i*size_doublet_v, - op->clover_doublet_vectorized + (op->num_even_sites+i)*size_doublet_v, column_doublet_offset ); -#endif - -#endif } void coarse_oddeven_PRECISION_set_couplings( level_struct *l, struct Thread *threading ) { @@ -444,16 +402,15 @@ void coarse_oddeven_alloc_PRECISION( level_struct *l ) { operator_PRECISION_alloc( op, _ODDEVEN, l ); // buffers - MALLOC( op->buffer, complex_PRECISION*, 2 ); - op->buffer[0] = NULL; + MALLOC( op->buffer, vector_PRECISION, 2 ); + for (int k=0; k<2; k++ ){ + vector_PRECISION_init( &(op->buffer[k]) ); #ifdef HAVE_TM1p1 - MALLOC( op->buffer[0], complex_PRECISION, 4*l->vector_size ); - op->buffer[1] = op->buffer[0] + 2*l->vector_size; + vector_PRECISION_alloc( &(op->buffer[k]), _ORDINARY, 2, l, no_threading ); #else - MALLOC( op->buffer[0], complex_PRECISION, 2*l->vector_size ); - op->buffer[1] = op->buffer[0] + l->vector_size; + vector_PRECISION_alloc( &(op->buffer[k]), _ORDINARY, 1, l, no_threading ); #endif - + } for ( mu=0; mu<4; mu++ ) { le[mu] = l->local_lattice[mu]; N[mu] = le[mu]+1; @@ -479,23 +436,10 @@ void coarse_oddeven_alloc_PRECISION( level_struct *l ) { } } -#ifndef OPTIMIZED_COARSE_SELF_COUPLING_PRECISION - MALLOC( op->clover_oo_inv, complex_PRECISION, SQUARE(2*nv)*op->num_odd_sites ); #ifdef HAVE_TM1p1 MALLOC( op->clover_doublet_oo_inv, complex_PRECISION, SQUARE(4*nv)*op->num_odd_sites ); #endif - -#else - int column_offset = SIMD_LENGTH_PRECISION*((2*nv+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - MALLOC_HUGEPAGES( op->clover_oo_inv_vectorized, PRECISION, 2*2*nv*column_offset*op->num_odd_sites, 4*SIMD_LENGTH_PRECISION ); -#ifdef HAVE_TM1p1 - int column_doublet_offset = SIMD_LENGTH_PRECISION*((4*nv+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - MALLOC_HUGEPAGES( op->clover_doublet_oo_inv_vectorized, PRECISION, 2*4*nv*column_doublet_offset*op->num_odd_sites, 4*SIMD_LENGTH_PRECISION ); -#endif - -#endif - // define data layout eot = op->index_table; define_eot( eot, N, l ); @@ -601,39 +545,22 @@ void coarse_oddeven_setup_PRECISION( operator_PRECISION_struct *in, int reorder, void coarse_oddeven_free_PRECISION( level_struct *l ) { - int nv = 
l->num_parent_eig_vect, vs = l->vector_size; + int nv = l->num_parent_eig_vect; operator_PRECISION_struct *op = &(l->oe_op_PRECISION); operator_PRECISION_free( op, _ODDEVEN, l ); - coarse_operator_PRECISION_free_vectorized( op, l ); - -#ifndef OPTIMIZED_COARSE_SELF_COUPLING_PRECISION FREE( op->clover_oo_inv, complex_PRECISION, SQUARE(2*nv)*op->num_odd_sites ); #ifdef HAVE_TM1p1 FREE( op->clover_doublet_oo_inv, complex_PRECISION, SQUARE(4*nv)*op->num_odd_sites ); #endif - -#else - int column_offset = SIMD_LENGTH_PRECISION*((2*nv+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - FREE_HUGEPAGES( op->clover_oo_inv_vectorized, PRECISION, 2*2*nv*column_offset*op->num_odd_sites ); -#ifdef HAVE_TM1p1 - int column_doublet_offset = SIMD_LENGTH_PRECISION*((4*nv+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - FREE_HUGEPAGES( op->clover_doublet_oo_inv_vectorized, PRECISION, 2*4*nv*column_doublet_offset*op->num_odd_sites ); -#endif - -#endif - -#ifdef HAVE_TM1p1 - FREE( op->buffer[0], complex_PRECISION, 4*vs ); -#else - FREE( op->buffer[0], complex_PRECISION, 2*vs ); -#endif - FREE( op->buffer, complex_PRECISION*, 2 ); + for (int k=0; k<2; k++ ) + vector_PRECISION_free( &(op->buffer[k]), l, no_threading ); + FREE( op->buffer, vector_PRECISION, 2 ); } -void coarse_hopping_term_PRECISION( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, +void coarse_hopping_term_PRECISION( vector_PRECISION *out, vector_PRECISION *in, operator_PRECISION_struct *op, const int amount, level_struct *l, struct Thread *threading ) { START_NO_HYPERTHREADS(threading) @@ -646,6 +573,9 @@ void coarse_hopping_term_PRECISION( vector_PRECISION out, vector_PRECISION in, o vector_PRECISION in_pt, out_pt; config_PRECISION D_pt; + in_pt = *in; + out_pt = *out; + int core_start; int core_end; @@ -665,7 +595,7 @@ void coarse_hopping_term_PRECISION( vector_PRECISION out, vector_PRECISION in, o if ( op->c.comm ) { for ( mu=0; mu<4; mu++ ) { // communicate in -mu direction - ghost_sendrecv_PRECISION( in, mu, -1, &(op->c), minus_dir_param, l ); + ghost_sendrecv_PRECISION( in->vector_buffer, mu, -1, &(op->c), minus_dir_param, l ); } } END_MASTER(threading) @@ -681,49 +611,49 @@ void coarse_hopping_term_PRECISION( vector_PRECISION out, vector_PRECISION in, o // compute U_mu^dagger coupling for ( i=core_start; ineighbor_table[index]; + in_pt.vector_buffer = in->vector_buffer + num_site_var*op->neighbor_table[index]; D_pt = op->D + num_4link_var*op->neighbor_table[index] + 0*num_link_var; index++; - out_pt = out + num_site_var*op->neighbor_table[index+T]; - coarse_daggered_hopp_PRECISION( out_pt, in_pt, D_pt, l ); + out_pt.vector_buffer = out->vector_buffer + num_site_var*op->neighbor_table[index+T]; + coarse_daggered_hopp_PRECISION( &out_pt, &in_pt, D_pt, l ); } SYNC_CORES(threading) for ( i=core_start; ineighbor_table[index]; + in_pt.vector_buffer = in->vector_buffer + num_site_var*op->neighbor_table[index]; D_pt = op->D + num_4link_var*op->neighbor_table[index] + 1*num_link_var; index++; - out_pt = out + num_site_var*op->neighbor_table[index+Z]; - coarse_daggered_hopp_PRECISION( out_pt, in_pt, D_pt, l ); + out_pt.vector_buffer = out->vector_buffer + num_site_var*op->neighbor_table[index+Z]; + coarse_daggered_hopp_PRECISION( &out_pt, &in_pt, D_pt, l ); } SYNC_CORES(threading) for ( i=core_start; ineighbor_table[index]; + in_pt.vector_buffer = in->vector_buffer + num_site_var*op->neighbor_table[index]; D_pt = op->D + num_4link_var*op->neighbor_table[index] + 2*num_link_var; index++; - out_pt = out + 
num_site_var*op->neighbor_table[index+Y]; - coarse_daggered_hopp_PRECISION( out_pt, in_pt, D_pt, l ); + out_pt.vector_buffer = out->vector_buffer + num_site_var*op->neighbor_table[index+Y]; + coarse_daggered_hopp_PRECISION( &out_pt, &in_pt, D_pt, l ); } SYNC_CORES(threading) for ( i=core_start; ineighbor_table[index]; + in_pt.vector_buffer = in->vector_buffer + num_site_var*op->neighbor_table[index]; D_pt = op->D + num_4link_var*op->neighbor_table[index] + 3*num_link_var; index++; - out_pt = out + num_site_var*op->neighbor_table[index+X]; - coarse_daggered_hopp_PRECISION( out_pt, in_pt, D_pt, l ); + out_pt.vector_buffer = out->vector_buffer + num_site_var*op->neighbor_table[index+X]; + coarse_daggered_hopp_PRECISION( &out_pt, &in_pt, D_pt, l ); } START_LOCKED_MASTER(threading) if ( op->c.comm ) { for ( mu=0; mu<4; mu++ ) { // communicate in +mu direction - ghost_sendrecv_PRECISION( out, mu, +1, &(op->c), plus_dir_param, l ); + ghost_sendrecv_PRECISION( out->vector_buffer, mu, +1, &(op->c), plus_dir_param, l ); } for ( mu=0; mu<4; mu++ ) { // wait for -mu direction - ghost_wait_PRECISION( in, mu, -1, &(op->c), minus_dir_param, l ); + ghost_wait_PRECISION( in->vector_buffer, mu, -1, &(op->c), minus_dir_param, l ); } } END_LOCKED_MASTER(threading) @@ -738,30 +668,30 @@ void coarse_hopping_term_PRECISION( vector_PRECISION out, vector_PRECISION in, o // compute U_mu couplings for ( i=core_start; ineighbor_table[index]; + out_pt.vector_buffer = out->vector_buffer + num_site_var*op->neighbor_table[index]; D_pt = op->D + num_4link_var*op->neighbor_table[index]; index++; - in_pt = in + num_site_var*op->neighbor_table[index+T]; - coarse_hopp_PRECISION( out_pt, in_pt, D_pt, l ); + in_pt.vector_buffer = in->vector_buffer + num_site_var*op->neighbor_table[index+T]; + coarse_hopp_PRECISION( &out_pt, &in_pt, D_pt, l ); D_pt += num_link_var; - in_pt = in + num_site_var*op->neighbor_table[index+Z]; - coarse_hopp_PRECISION( out_pt, in_pt, D_pt, l ); + in_pt.vector_buffer = in->vector_buffer + num_site_var*op->neighbor_table[index+Z]; + coarse_hopp_PRECISION( &out_pt, &in_pt, D_pt, l ); D_pt += num_link_var; - in_pt = in + num_site_var*op->neighbor_table[index+Y]; - coarse_hopp_PRECISION( out_pt, in_pt, D_pt, l ); + in_pt.vector_buffer = in->vector_buffer + num_site_var*op->neighbor_table[index+Y]; + coarse_hopp_PRECISION( &out_pt, &in_pt, D_pt, l ); D_pt += num_link_var; - in_pt = in + num_site_var*op->neighbor_table[index+X]; - coarse_hopp_PRECISION( out_pt, in_pt, D_pt, l ); + in_pt.vector_buffer = in->vector_buffer + num_site_var*op->neighbor_table[index+X]; + coarse_hopp_PRECISION( &out_pt, &in_pt, D_pt, l ); } START_LOCKED_MASTER(threading) if ( op->c.comm ) { for ( mu=0; mu<4; mu++ ) { // wait for +mu direction - ghost_wait_PRECISION( out, mu, +1, &(op->c), plus_dir_param, l ); + ghost_wait_PRECISION( out->vector_buffer, mu, +1, &(op->c), plus_dir_param, l ); } } END_LOCKED_MASTER(threading) @@ -770,18 +700,9 @@ void coarse_hopping_term_PRECISION( vector_PRECISION out, vector_PRECISION in, o } -void coarse_n_hopping_term_PRECISION( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, +void coarse_n_hopping_term_PRECISION( vector_PRECISION *out, vector_PRECISION *in, operator_PRECISION_struct *op, const int amount, level_struct *l, struct Thread *threading ) { -#ifdef OPTIMIZED_COARSE_NEIGHBOR_COUPLING_PRECISION -#ifndef COMM_HIDING_COARSEOP - int sign = -1; - coarse_pn_hopping_term_PRECISION_vectorized( out, in, op, amount, l, sign, threading); -#else - 
coarse_n_hopping_term_PRECISION_vectorized( out, in, op, amount, l, threading ); -#endif - return; -#else START_NO_HYPERTHREADS(threading) int mu, i, index, num_site_var=l->num_lattice_site_var, @@ -791,6 +712,8 @@ void coarse_n_hopping_term_PRECISION( vector_PRECISION out, vector_PRECISION in, plus_dir_param=_FULL_SYSTEM, minus_dir_param=_FULL_SYSTEM; vector_PRECISION in_pt, out_pt; config_PRECISION D_pt; + in_pt = *in; + out_pt = *out; int core_start; int core_end; @@ -811,7 +734,7 @@ void coarse_n_hopping_term_PRECISION( vector_PRECISION out, vector_PRECISION in, if ( op->c.comm ) { for ( mu=0; mu<4; mu++ ) { // communicate in -mu direction - ghost_sendrecv_PRECISION( in, mu, -1, &(op->c), minus_dir_param, l ); + ghost_sendrecv_PRECISION( in->vector_buffer, mu, -1, &(op->c), minus_dir_param, l ); } } END_MASTER(threading) @@ -827,49 +750,49 @@ void coarse_n_hopping_term_PRECISION( vector_PRECISION out, vector_PRECISION in, // compute U_mu^dagger coupling for ( i=core_start; ineighbor_table[index]; + in_pt.vector_buffer = in->vector_buffer + num_site_var*op->neighbor_table[index]; D_pt = op->D + num_4link_var*op->neighbor_table[index] + 0*num_link_var; index++; - out_pt = out + num_site_var*op->neighbor_table[index+T]; - coarse_n_daggered_hopp_PRECISION( out_pt, in_pt, D_pt, l ); + out_pt.vector_buffer = out->vector_buffer + num_site_var*op->neighbor_table[index+T]; + coarse_n_daggered_hopp_PRECISION( &out_pt, &in_pt, D_pt, l ); } SYNC_CORES(threading) for ( i=core_start; ineighbor_table[index]; + in_pt.vector_buffer = in->vector_buffer + num_site_var*op->neighbor_table[index]; D_pt = op->D + num_4link_var*op->neighbor_table[index] + 1*num_link_var; index++; - out_pt = out + num_site_var*op->neighbor_table[index+Z]; - coarse_n_daggered_hopp_PRECISION( out_pt, in_pt, D_pt, l ); + out_pt.vector_buffer = out->vector_buffer + num_site_var*op->neighbor_table[index+Z]; + coarse_n_daggered_hopp_PRECISION( &out_pt, &in_pt, D_pt, l ); } SYNC_CORES(threading) for ( i=core_start; ineighbor_table[index]; + in_pt.vector_buffer = in->vector_buffer + num_site_var*op->neighbor_table[index]; D_pt = op->D + num_4link_var*op->neighbor_table[index] + 2*num_link_var; index++; - out_pt = out + num_site_var*op->neighbor_table[index+Y]; - coarse_n_daggered_hopp_PRECISION( out_pt, in_pt, D_pt, l ); + out_pt.vector_buffer = out->vector_buffer + num_site_var*op->neighbor_table[index+Y]; + coarse_n_daggered_hopp_PRECISION( &out_pt, &in_pt, D_pt, l ); } SYNC_CORES(threading) for ( i=core_start; ineighbor_table[index]; + in_pt.vector_buffer = in->vector_buffer + num_site_var*op->neighbor_table[index]; D_pt = op->D + num_4link_var*op->neighbor_table[index] + 3*num_link_var; index++; - out_pt = out + num_site_var*op->neighbor_table[index+X]; - coarse_n_daggered_hopp_PRECISION( out_pt, in_pt, D_pt, l ); + out_pt.vector_buffer = out->vector_buffer + num_site_var*op->neighbor_table[index+X]; + coarse_n_daggered_hopp_PRECISION( &out_pt, &in_pt, D_pt, l ); } START_LOCKED_MASTER(threading) if ( op->c.comm ) { for ( mu=0; mu<4; mu++ ) { // communicate in +mu direction - ghost_sendrecv_PRECISION( out, mu, +1, &(op->c), plus_dir_param, l ); + ghost_sendrecv_PRECISION( out->vector_buffer, mu, +1, &(op->c), plus_dir_param, l ); } for ( mu=0; mu<4; mu++ ) { // wait for -mu direction - ghost_wait_PRECISION( in, mu, -1, &(op->c), minus_dir_param, l ); + ghost_wait_PRECISION( in->vector_buffer, mu, -1, &(op->c), minus_dir_param, l ); } } END_LOCKED_MASTER(threading) @@ -884,444 +807,35 @@ void coarse_n_hopping_term_PRECISION( 
vector_PRECISION out, vector_PRECISION in, // compute U_mu couplings for ( i=core_start; ineighbor_table[index]; + out_pt.vector_buffer = out->vector_buffer + num_site_var*op->neighbor_table[index]; D_pt = op->D + num_4link_var*op->neighbor_table[index]; index++; - in_pt = in + num_site_var*op->neighbor_table[index+T]; - coarse_n_hopp_PRECISION( out_pt, in_pt, D_pt, l ); + in_pt.vector_buffer = in->vector_buffer + num_site_var*op->neighbor_table[index+T]; + coarse_n_hopp_PRECISION( &out_pt, &in_pt, D_pt, l ); D_pt += num_link_var; - in_pt = in + num_site_var*op->neighbor_table[index+Z]; - coarse_n_hopp_PRECISION( out_pt, in_pt, D_pt, l ); + in_pt.vector_buffer = in->vector_buffer + num_site_var*op->neighbor_table[index+Z]; + coarse_n_hopp_PRECISION( &out_pt, &in_pt, D_pt, l ); D_pt += num_link_var; - in_pt = in + num_site_var*op->neighbor_table[index+Y]; - coarse_n_hopp_PRECISION( out_pt, in_pt, D_pt, l ); + in_pt.vector_buffer = in->vector_buffer + num_site_var*op->neighbor_table[index+Y]; + coarse_n_hopp_PRECISION( &out_pt, &in_pt, D_pt, l ); D_pt += num_link_var; - in_pt = in + num_site_var*op->neighbor_table[index+X]; - coarse_n_hopp_PRECISION( out_pt, in_pt, D_pt, l ); + in_pt.vector_buffer = in->vector_buffer + num_site_var*op->neighbor_table[index+X]; + coarse_n_hopp_PRECISION( &out_pt, &in_pt, D_pt, l ); } START_LOCKED_MASTER(threading) if ( op->c.comm ) { for ( mu=0; mu<4; mu++ ) { // wait for +mu direction - ghost_wait_PRECISION( out, mu, +1, &(op->c), plus_dir_param, l ); + ghost_wait_PRECISION( out->vector_buffer, mu, +1, &(op->c), plus_dir_param, l ); } } END_LOCKED_MASTER(threading) END_NO_HYPERTHREADS(threading) -#endif -} - - -void coarse_hopping_term_PRECISION_vectorized( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, - const int amount, level_struct *l, struct Thread *threading ) { - -#ifdef OPTIMIZED_COARSE_NEIGHBOR_COUPLING_PRECISION - START_NO_HYPERTHREADS(threading) - - int mu, i, index, num_site_var=l->num_lattice_site_var, - start=0, num_lattice_sites=l->num_inner_lattice_sites, - plus_dir_param=_FULL_SYSTEM, minus_dir_param=_FULL_SYSTEM; - vector_PRECISION in_pt, out_pt; - - OPERATOR_TYPE_PRECISION *D_vectorized; - int column_offset = 2*SIMD_LENGTH_PRECISION*((l->num_parent_eig_vect+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - int vectorized_link_offset = 2*2*l->num_parent_eig_vect*column_offset; - - int core_start; - int core_end; - - // assumptions (1) self coupling has already been performed - // OR (2) "out" is initialized with zeros - set_boundary_PRECISION( out, 0, l, threading ); - - if ( amount == _EVEN_SITES ) { - minus_dir_param = _ODD_SITES; - plus_dir_param = _EVEN_SITES; - } else if ( amount == _ODD_SITES ) { - minus_dir_param = _EVEN_SITES; - plus_dir_param = _ODD_SITES; - } - - START_MASTER(threading) - if ( op->c.comm ) { - for ( mu=0; mu<4; mu++ ) { - // communicate in -mu direction - ghost_sendrecv_PRECISION( in, mu, -1, &(op->c), minus_dir_param, l ); - } - } - END_MASTER(threading) - SYNC_CORES(threading) - - if ( amount == _EVEN_SITES ) { - start = op->num_even_sites, num_lattice_sites = op->num_odd_sites; - } else if ( amount == _ODD_SITES ) { - start = 0; num_lattice_sites = op->num_even_sites; - } - compute_core_start_end_custom(start, start+num_lattice_sites, &core_start, &core_end, l, threading, 1); - - // compute U_mu^dagger coupling - for ( i=core_start; ineighbor_table[index]; - D_vectorized = op->D_transformed_vectorized + 4*vectorized_link_offset*op->neighbor_table[index] + 0*vectorized_link_offset; - 
index++; - out_pt = out + num_site_var*op->neighbor_table[index+T]; - coarse_hopp_PRECISION_vectorized( out_pt, in_pt, D_vectorized, l ); - } - SYNC_CORES(threading) - for ( i=core_start; ineighbor_table[index]; - D_vectorized = op->D_transformed_vectorized + 4*vectorized_link_offset*op->neighbor_table[index] + 1*vectorized_link_offset; - index++; - out_pt = out + num_site_var*op->neighbor_table[index+Z]; - coarse_hopp_PRECISION_vectorized( out_pt, in_pt, D_vectorized, l ); - } - SYNC_CORES(threading) - for ( i=core_start; ineighbor_table[index]; - D_vectorized = op->D_transformed_vectorized + 4*vectorized_link_offset*op->neighbor_table[index] + 2*vectorized_link_offset; - index++; - out_pt = out + num_site_var*op->neighbor_table[index+Y]; - coarse_hopp_PRECISION_vectorized( out_pt, in_pt, D_vectorized, l ); - } - SYNC_CORES(threading) - for ( i=core_start; ineighbor_table[index]; - D_vectorized = op->D_transformed_vectorized + 4*vectorized_link_offset*op->neighbor_table[index] + 3*vectorized_link_offset; - index++; - out_pt = out + num_site_var*op->neighbor_table[index+X]; - coarse_hopp_PRECISION_vectorized( out_pt, in_pt, D_vectorized, l ); - } - - START_LOCKED_MASTER(threading) - if ( op->c.comm ) { - for ( mu=0; mu<4; mu++ ) { - // communicate in +mu direction - ghost_sendrecv_PRECISION( out, mu, +1, &(op->c), plus_dir_param, l ); - } - for ( mu=0; mu<4; mu++ ) { - // wait for -mu direction - ghost_wait_PRECISION( in, mu, -1, &(op->c), minus_dir_param, l ); - } - } - END_LOCKED_MASTER(threading) - - if ( amount == _EVEN_SITES ) { - start = 0; num_lattice_sites = op->num_even_sites; - } else if ( amount == _ODD_SITES ) { - start = op->num_even_sites, num_lattice_sites = op->num_odd_sites; - } - compute_core_start_end_custom(start, start+num_lattice_sites, &core_start, &core_end, l, threading, 1); - - // compute U_mu couplings - for ( i=core_start; ineighbor_table[index]; - D_vectorized = op->D_vectorized + 4*vectorized_link_offset*op->neighbor_table[index]; - index++; - in_pt = in + num_site_var*op->neighbor_table[index+T]; - coarse_hopp_PRECISION_vectorized( out_pt, in_pt, D_vectorized, l ); - - D_vectorized += vectorized_link_offset; - in_pt = in + num_site_var*op->neighbor_table[index+Z]; - coarse_hopp_PRECISION_vectorized( out_pt, in_pt, D_vectorized, l ); - - in_pt = in + num_site_var*op->neighbor_table[index+Y]; - D_vectorized += vectorized_link_offset; - coarse_hopp_PRECISION_vectorized( out_pt, in_pt, D_vectorized, l ); - - in_pt = in + num_site_var*op->neighbor_table[index+X]; - D_vectorized += vectorized_link_offset; - coarse_hopp_PRECISION_vectorized( out_pt, in_pt, D_vectorized, l ); - } - - START_LOCKED_MASTER(threading) - if ( op->c.comm ) { - for ( mu=0; mu<4; mu++ ) { - // wait for +mu direction - ghost_wait_PRECISION( out, mu, +1, &(op->c), plus_dir_param, l ); - } - } - END_LOCKED_MASTER(threading) - - END_NO_HYPERTHREADS(threading) -#endif -} - - -void coarse_pn_hopping_term_PRECISION_vectorized( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, - const int amount, level_struct *l, int sign, struct Thread *threading ) { - -#ifdef OPTIMIZED_COARSE_NEIGHBOR_COUPLING_PRECISION - START_NO_HYPERTHREADS(threading) - - int mu, i, num_site_var=l->num_lattice_site_var, - start=0, num_lattice_sites=l->num_inner_lattice_sites, - plus_dir_param=_FULL_SYSTEM, minus_dir_param=_FULL_SYSTEM; - vector_PRECISION in_pt, out_pt; - - OPERATOR_TYPE_PRECISION *D_vectorized; - int column_offset = 
2*SIMD_LENGTH_PRECISION*((l->num_parent_eig_vect+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - int link_offset = 2*2*l->num_parent_eig_vect*column_offset; - int *neighbor_fw = op->neighbor_table; - int *neighbor_bw = op->backward_neighbor_table; - - int core_start; - int core_end; - - void (*coarse_hopp)(vector_PRECISION eta, vector_PRECISION phi, OPERATOR_TYPE_PRECISION *D, level_struct *l); - if(sign == +1) - coarse_hopp = coarse_hopp_PRECISION_vectorized; - else - coarse_hopp = coarse_n_hopp_PRECISION_vectorized; - - - if ( l->num_processes > 1 && op->c.comm ) { - set_boundary_PRECISION( out, 0, l, threading ); - - if ( amount == _EVEN_SITES ) { - minus_dir_param = _ODD_SITES; - plus_dir_param = _EVEN_SITES; - } else if ( amount == _ODD_SITES ) { - minus_dir_param = _EVEN_SITES; - plus_dir_param = _ODD_SITES; - } - - START_MASTER(threading) - for ( mu=0; mu<4; mu++ ) { - // send in -mu direction - ghost_sendrecv_PRECISION( in, mu, -1, &(op->c), minus_dir_param, l ); - } - END_MASTER(threading) - - if ( amount == _EVEN_SITES ) { - start = op->num_even_sites, num_lattice_sites = op->num_odd_sites; - } else if ( amount == _ODD_SITES ) { - start = 0; num_lattice_sites = op->num_even_sites; - } - compute_core_start_end_custom(start, start+num_lattice_sites, &core_start, &core_end, l, threading, 1); - - // prepare for sending to fw: compute hopping terms into forward boundary buffer - for ( i=core_start; inum_inner_lattice_sites) - continue; - out_pt = out + num_site_var*neighbor_fw[5*i+1+mu]; - in_pt = in + num_site_var*neighbor_fw[5*i]; - D_vectorized = op->D_transformed_vectorized + 4*link_offset*neighbor_fw[5*i] + mu*link_offset; - coarse_hopp( out_pt, in_pt, D_vectorized, l ); - } - } - START_LOCKED_MASTER(threading) - for ( mu=0; mu<4; mu++ ) { - // send in +mu direction - ghost_sendrecv_PRECISION( out, mu, +1, &(op->c), plus_dir_param, l ); - } - for ( mu=0; mu<4; mu++ ) { - // wait for -mu direction - ghost_wait_PRECISION( in, mu, -1, &(op->c), minus_dir_param, l ); - } - END_LOCKED_MASTER(threading) - } - else - SYNC_CORES(threading) - - - if ( amount == _EVEN_SITES ) { - start = 0; num_lattice_sites = op->num_even_sites; - } else if ( amount == _ODD_SITES ) { - start = op->num_even_sites, num_lattice_sites = op->num_odd_sites; - } - compute_core_start_end_custom(start, start+num_lattice_sites, &core_start, &core_end, l, threading, 1); - - // assumptions (1) self coupling has already been performed - // OR (2) "out" is initialized with zeros - for ( i=core_start; i= l->num_inner_lattice_sites) - continue; - D_vectorized = op->D_transformed_vectorized + 4*link_offset*neighbor_bw[5*i+1+mu] + mu*link_offset; - in_pt = in + num_site_var*neighbor_bw[5*i+1+mu]; - coarse_hopp( out_pt, in_pt, D_vectorized, l ); - } - - // compute U_mu couplings - for(int mu=0; mu<4; mu++) { - D_vectorized = op->D_vectorized + 4*link_offset*neighbor_fw[5*i] + mu*link_offset; - in_pt = in + num_site_var*neighbor_fw[5*i+1+mu]; - coarse_hopp( out_pt, in_pt, D_vectorized, l ); - } - } - - - // wait for terms from bw and add them - if ( l->num_processes > 1 && op->c.comm ) { - START_LOCKED_MASTER(threading) - for ( mu=0; mu<4; mu++ ) { - // wait for +mu direction - ghost_wait_PRECISION( out, mu, +1, &(op->c), plus_dir_param, l ); - } - END_LOCKED_MASTER(threading) - } - else - SYNC_CORES(threading) - - END_NO_HYPERTHREADS(threading) -#endif -} - - -void coarse_n_hopping_term_PRECISION_vectorized( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, - const int amount, level_struct *l, 
struct Thread *threading ) { - -#ifdef OPTIMIZED_COARSE_NEIGHBOR_COUPLING_PRECISION - START_NO_HYPERTHREADS(threading) - - int mu, i, index, num_site_var=l->num_lattice_site_var, - start=0, num_lattice_sites=l->num_inner_lattice_sites, - plus_dir_param=_FULL_SYSTEM, minus_dir_param=_FULL_SYSTEM; - vector_PRECISION in_pt, out_pt; - - OPERATOR_TYPE_PRECISION *D_vectorized; - int column_offset = 2*SIMD_LENGTH_PRECISION*((l->num_parent_eig_vect+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - int vectorized_link_offset = 2*2*l->num_parent_eig_vect*column_offset; - - int core_start; - int core_end; - - // assumptions (1) self coupling has already been performed - // OR (2) "out" is initialized with zeros - set_boundary_PRECISION( out, 0, l, threading ); - - if ( amount == _EVEN_SITES ) { - minus_dir_param = _ODD_SITES; - plus_dir_param = _EVEN_SITES; - } else if ( amount == _ODD_SITES ) { - minus_dir_param = _EVEN_SITES; - plus_dir_param = _ODD_SITES; - } - - START_MASTER(threading) - if ( op->c.comm ) { - for ( mu=0; mu<4; mu++ ) { - // communicate in -mu direction - ghost_sendrecv_PRECISION( in, mu, -1, &(op->c), minus_dir_param, l ); - } - } - END_MASTER(threading) - SYNC_CORES(threading) - - if ( amount == _EVEN_SITES ) { - start = op->num_even_sites, num_lattice_sites = op->num_odd_sites; - } else if ( amount == _ODD_SITES ) { - start = 0; num_lattice_sites = op->num_even_sites; - } - compute_core_start_end_custom(start, start+num_lattice_sites, &core_start, &core_end, l, threading, 1); - - // D is applied in an input-centric way - // this makes threading a bit ugly, is there a better way? - // compute U_mu^dagger coupling - for ( i=core_start; ineighbor_table[index]; - D_vectorized = op->D_transformed_vectorized + 4*vectorized_link_offset*op->neighbor_table[index] + 0*vectorized_link_offset; - index++; - out_pt = out + num_site_var*op->neighbor_table[index+T]; - coarse_n_hopp_PRECISION_vectorized( out_pt, in_pt, D_vectorized, l ); - } - SYNC_CORES(threading) - for ( i=core_start; ineighbor_table[index]; - D_vectorized = op->D_transformed_vectorized + 4*vectorized_link_offset*op->neighbor_table[index] + 1*vectorized_link_offset; - index++; - out_pt = out + num_site_var*op->neighbor_table[index+Z]; - coarse_n_hopp_PRECISION_vectorized( out_pt, in_pt, D_vectorized, l ); - } - SYNC_CORES(threading) - for ( i=core_start; ineighbor_table[index]; - D_vectorized = op->D_transformed_vectorized + 4*vectorized_link_offset*op->neighbor_table[index] + 2*vectorized_link_offset; - index++; - out_pt = out + num_site_var*op->neighbor_table[index+Y]; - coarse_n_hopp_PRECISION_vectorized( out_pt, in_pt, D_vectorized, l ); - } - SYNC_CORES(threading) - for ( i=core_start; ineighbor_table[index]; - D_vectorized = op->D_transformed_vectorized + 4*vectorized_link_offset*op->neighbor_table[index] + 3*vectorized_link_offset; - index++; - out_pt = out + num_site_var*op->neighbor_table[index+X]; - coarse_n_hopp_PRECISION_vectorized( out_pt, in_pt, D_vectorized, l ); - } - - START_LOCKED_MASTER(threading) - if ( op->c.comm ) { - for ( mu=0; mu<4; mu++ ) { - // communicate in +mu direction - ghost_sendrecv_PRECISION( out, mu, +1, &(op->c), plus_dir_param, l ); - } - for ( mu=0; mu<4; mu++ ) { - // wait for -mu direction - ghost_wait_PRECISION( in, mu, -1, &(op->c), minus_dir_param, l ); - } - } - END_LOCKED_MASTER(threading) - - if ( amount == _EVEN_SITES ) { - start = 0; num_lattice_sites = op->num_even_sites; - } else if ( amount == _ODD_SITES ) { - start = op->num_even_sites, num_lattice_sites = 
op->num_odd_sites; - } - compute_core_start_end_custom(start, start+num_lattice_sites, &core_start, &core_end, l, threading, 1); - - // compute U_mu couplings - for ( i=core_start; ineighbor_table[index]; - D_vectorized = op->D_vectorized + 4*vectorized_link_offset*op->neighbor_table[index]; - index++; - in_pt = in + num_site_var*op->neighbor_table[index+T]; - coarse_n_hopp_PRECISION_vectorized( out_pt, in_pt, D_vectorized, l ); - - D_vectorized += vectorized_link_offset; - in_pt = in + num_site_var*op->neighbor_table[index+Z]; - coarse_n_hopp_PRECISION_vectorized( out_pt, in_pt, D_vectorized, l ); - - D_vectorized += vectorized_link_offset; - in_pt = in + num_site_var*op->neighbor_table[index+Y]; - coarse_n_hopp_PRECISION_vectorized( out_pt, in_pt, D_vectorized, l ); - - D_vectorized += vectorized_link_offset; - in_pt = in + num_site_var*op->neighbor_table[index+X]; - coarse_n_hopp_PRECISION_vectorized( out_pt, in_pt, D_vectorized, l ); - } - - START_LOCKED_MASTER(threading) - if ( op->c.comm ) { - for ( mu=0; mu<4; mu++ ) { - // wait for +mu direction - ghost_wait_PRECISION( out, mu, +1, &(op->c), plus_dir_param, l ); - } - } - END_LOCKED_MASTER(threading) - - END_NO_HYPERTHREADS(threading) -#endif } @@ -1329,26 +843,26 @@ void coarse_solve_odd_even_PRECISION( gmres_PRECISION_struct *p, operator_PRECIS SYNC_CORES(threading) PROF_PRECISION_START( _SC, threading ); - coarse_diag_oo_inv_PRECISION( p->x, p->b, op, l, threading ); + coarse_diag_oo_inv_PRECISION( &p->x, &p->b, op, l, threading ); PROF_PRECISION_STOP( _SC, 0, threading ); PROF_PRECISION_START( _NC, threading ); - coarse_n_hopping_term_PRECISION( p->b, p->x, op, _EVEN_SITES, l, threading ); + coarse_n_hopping_term_PRECISION( &p->b, &p->x, op, _EVEN_SITES, l, threading ); PROF_PRECISION_STOP( _NC, 0, threading ); fgmres_PRECISION( p, l, threading ); // even to odd PROF_PRECISION_START( _NC, threading ); - coarse_n_hopping_term_PRECISION( p->b, p->x, op, _ODD_SITES, l, threading ); + coarse_n_hopping_term_PRECISION( &p->b, &p->x, op, _ODD_SITES, l, threading ); PROF_PRECISION_STOP( _NC, 1, threading ); PROF_PRECISION_START( _SC, threading ); - coarse_diag_oo_inv_PRECISION( p->x, p->b, op, l, threading ); + coarse_diag_oo_inv_PRECISION( &p->x, &p->b, op, l, threading ); PROF_PRECISION_STOP( _SC, 1, threading ); SYNC_CORES(threading) } -void coarse_apply_schur_complement_PRECISION( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { +void coarse_apply_schur_complement_PRECISION( vector_PRECISION *out, vector_PRECISION *in, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { // start and end indices for vector functions depending on thread int start; @@ -1364,16 +878,16 @@ void coarse_apply_schur_complement_PRECISION( vector_PRECISION out, vector_PRECI coarse_diag_ee_PRECISION( out, in, op, l, threading ); PROF_PRECISION_STOP( _SC, 0, threading ); SYNC_CORES(threading) - vector_PRECISION_define( tmp[0], 0, start, end, l ); + vector_PRECISION_define( &tmp[0], 0, start, end, l ); SYNC_CORES(threading) PROF_PRECISION_START( _NC, threading ); - coarse_hopping_term_PRECISION( tmp[0], in, op, _ODD_SITES, l, threading ); + coarse_hopping_term_PRECISION( &tmp[0], in, op, _ODD_SITES, l, threading ); PROF_PRECISION_STOP( _NC, 0, threading ); PROF_PRECISION_START( _SC, threading ); - coarse_diag_oo_inv_PRECISION( tmp[1], tmp[0], op, l, threading ); + coarse_diag_oo_inv_PRECISION( &tmp[1], &tmp[0], op, l, threading ); PROF_PRECISION_STOP( _SC, 1, 
threading ); PROF_PRECISION_START( _NC, threading ); - coarse_n_hopping_term_PRECISION( out, tmp[1], op, _EVEN_SITES, l, threading ); + coarse_n_hopping_term_PRECISION( out, &tmp[1], op, _EVEN_SITES, l, threading ); PROF_PRECISION_STOP( _NC, 1, threading ); } @@ -1387,47 +901,47 @@ void g5D_coarse_solve_odd_even_PRECISION( gmres_PRECISION_struct *p, operator_PR vector_PRECISION tmp = op->buffer[0]; SYNC_CORES(threading) - vector_PRECISION_define( tmp, 0, start_even, end_even, l ); + vector_PRECISION_define( &tmp, 0, start_even, end_even, l ); SYNC_CORES(threading) PROF_PRECISION_START( _SC, threading ); - coarse_gamma5_PRECISION( p->b, p->b, start_odd, end_odd, l ); + coarse_gamma5_PRECISION( &p->b, &p->b, start_odd, end_odd, l ); SYNC_CORES(threading) - coarse_diag_oo_inv_PRECISION( p->x, p->b, op, l, threading ); + coarse_diag_oo_inv_PRECISION( &p->x, &p->b, op, l, threading ); SYNC_CORES(threading) - coarse_gamma5_PRECISION( p->b, p->b, start_odd, end_odd, l ); + coarse_gamma5_PRECISION( &p->b, &p->b, start_odd, end_odd, l ); PROF_PRECISION_STOP( _SC, 0, threading ); PROF_PRECISION_START( _NC, threading ); - coarse_n_hopping_term_PRECISION( tmp, p->x, op, _EVEN_SITES, l, threading ); + coarse_n_hopping_term_PRECISION( &tmp, &p->x, op, _EVEN_SITES, l, threading ); PROF_PRECISION_STOP( _NC, 0, threading ); - coarse_gamma5_PRECISION( tmp, tmp, start_even, end_even, l ); + coarse_gamma5_PRECISION( &tmp, &tmp, start_even, end_even, l ); SYNC_CORES(threading) - vector_PRECISION_plus( p->b, p->b, tmp, start_even, end_even, l ); + vector_PRECISION_plus( &p->b, &p->b, &tmp, start_even, end_even, l ); fgmres_PRECISION( p, l, threading ); SYNC_CORES(threading) - coarse_gamma5_PRECISION( p->b, p->b, start_odd, end_odd, l ); + coarse_gamma5_PRECISION( &p->b, &p->b, start_odd, end_odd, l ); SYNC_CORES(threading) - coarse_diag_oo_inv_PRECISION( p->x, p->b, op, l, threading ); + coarse_diag_oo_inv_PRECISION( &p->x, &p->b, op, l, threading ); SYNC_CORES(threading) // even to odd PROF_PRECISION_START( _NC, threading ); - vector_PRECISION_define( tmp, 0, start_odd, end_odd, l ); + vector_PRECISION_define( &tmp, 0, start_odd, end_odd, l ); SYNC_CORES(threading) - coarse_n_hopping_term_PRECISION( tmp, p->x, op, _ODD_SITES, l, threading ); + coarse_n_hopping_term_PRECISION( &tmp, &p->x, op, _ODD_SITES, l, threading ); PROF_PRECISION_STOP( _NC, 1, threading ); SYNC_CORES(threading) PROF_PRECISION_START( _SC, threading ); - coarse_diag_oo_inv_PRECISION( p->b, tmp, op, l, threading ); - vector_PRECISION_plus( p->x, p->x, p->b, start_odd, end_odd, l ); + coarse_diag_oo_inv_PRECISION( &p->b, &tmp, op, l, threading ); + vector_PRECISION_plus( &p->x, &p->x, &p->b, start_odd, end_odd, l ); PROF_PRECISION_STOP( _SC, 1, threading ); SYNC_CORES(threading) } -void g5D_coarse_apply_schur_complement_PRECISION( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { +void g5D_coarse_apply_schur_complement_PRECISION( vector_PRECISION *out, vector_PRECISION *in, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { int start_even, end_even, start_odd, end_odd; compute_core_start_end_custom(0, op->num_even_sites*l->num_lattice_site_var, &start_even, &end_even, l, threading, l->num_lattice_site_var ); @@ -1440,16 +954,16 @@ void g5D_coarse_apply_schur_complement_PRECISION( vector_PRECISION out, vector_P coarse_diag_ee_PRECISION( out, in, op, l, threading ); PROF_PRECISION_STOP( _SC, 0, threading ); SYNC_CORES(threading) - 
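/* --- Editorial sketch (not part of the patch) ------------------------------
   As far as these hunks show, coarse_solve_odd_even_PRECISION performs the
   usual even/odd reduction: eliminate the odd sites with D_oo^{-1}, let
   fgmres solve the Schur system S x_e = b_e - D_eo D_oo^{-1} b_o with
   S = D_ee - D_eo D_oo^{-1} D_oe (the operator that
   coarse_apply_schur_complement_PRECISION applies), then reconstruct
   x_o = D_oo^{-1} ( b_o - D_oe x_e ).  A self-contained check of that
   algebra with 1x1 blocks (plain doubles, illustrative only): */
#include <stdio.h>
int main( void ) {
  /* 2x2 system in even/odd block form: [ Dee Deo ; Doe Doo ] [xe;xo] = [be;bo] */
  double Dee = 4.0, Deo = 1.0, Doe = 2.0, Doo = 5.0, be = 3.0, bo = 7.0;
  double S      = Dee - Deo*(1.0/Doo)*Doe;     /* Schur complement on even sites  */
  double be_red = be  - Deo*(1.0/Doo)*bo;      /* reduced right-hand side         */
  double xe     = be_red/S;                    /* the "fgmres" step, trivial here */
  double xo     = (1.0/Doo)*(bo - Doe*xe);     /* back substitution on odd sites  */
  printf( "even residual: %e\n", Dee*xe + Deo*xo - be );  /* both print ~0 */
  printf( "odd  residual: %e\n", Doe*xe + Doo*xo - bo );
  return 0;
}
/* -------------------------------------------------------------------------- */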
vector_PRECISION_define( tmp[0], 0, start_odd, end_odd, l ); + vector_PRECISION_define( &tmp[0], 0, start_odd, end_odd, l ); SYNC_CORES(threading) PROF_PRECISION_START( _NC, threading ); - coarse_hopping_term_PRECISION( tmp[0], in, op, _ODD_SITES, l, threading ); + coarse_hopping_term_PRECISION( &tmp[0], in, op, _ODD_SITES, l, threading ); PROF_PRECISION_STOP( _NC, 0, threading ); PROF_PRECISION_START( _SC, threading ); - coarse_diag_oo_inv_PRECISION( tmp[1], tmp[0], op, l, threading ); + coarse_diag_oo_inv_PRECISION( &tmp[1], &tmp[0], op, l, threading ); PROF_PRECISION_STOP( _SC, 1, threading ); PROF_PRECISION_START( _NC, threading ); - coarse_n_hopping_term_PRECISION( out, tmp[1], op, _EVEN_SITES, l, threading ); + coarse_n_hopping_term_PRECISION( out, &tmp[1], op, _EVEN_SITES, l, threading ); PROF_PRECISION_STOP( _NC, 1, threading ); SYNC_CORES(threading) coarse_gamma5_PRECISION( out, out, start_even, end_even, l ); @@ -1457,52 +971,55 @@ void g5D_coarse_apply_schur_complement_PRECISION( vector_PRECISION out, vector_P } -void coarse_odd_even_PRECISION_test( vector_PRECISION out, vector_PRECISION in, level_struct *l, struct Thread *threading ) { +void coarse_odd_even_PRECISION_test( vector_PRECISION *out, vector_PRECISION *in, level_struct *l, struct Thread *threading ) { if ( g.odd_even ) { - vector_PRECISION buf1 = NULL, buf2 = NULL; - - PUBLIC_MALLOC( buf1, complex_PRECISION, 2*l->vector_size ); - buf2 = buf1 + l->vector_size; + vector_PRECISION buf[2]; + for(int i=0; i<2; i++){ + vector_PRECISION_init( &buf[i] ); + vector_PRECISION_alloc( &buf[i], _ORDINARY, 1, l, threading ); + } + START_LOCKED_MASTER(threading) // transformation part - vector_PRECISION_copy( buf1, in, 0, l->inner_vector_size, l ); + vector_PRECISION_copy( &buf[0], in, 0, l->inner_vector_size, l ); // even to odd vector_PRECISION_define( out, 0, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l->inner_vector_size, l ); END_LOCKED_MASTER(threading) - coarse_hopping_term_PRECISION( out, buf1, &(l->oe_op_PRECISION), _ODD_SITES, l, threading ); - coarse_diag_oo_inv_PRECISION( buf2, out, &(l->oe_op_PRECISION), l, threading ); + coarse_hopping_term_PRECISION( out, &buf[0], &(l->oe_op_PRECISION), _ODD_SITES, l, threading ); + coarse_diag_oo_inv_PRECISION( &buf[1], out, &(l->oe_op_PRECISION), l, threading ); START_LOCKED_MASTER(threading) - vector_PRECISION_plus( buf1, buf1, buf2, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l->inner_vector_size, l ); + vector_PRECISION_plus( &buf[0], &buf[0], &buf[1], l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l->inner_vector_size, l ); END_LOCKED_MASTER(threading) // block diagonal part if ( g.method == 6 ) { - g5D_coarse_apply_schur_complement_PRECISION( out, buf1, &(l->oe_op_PRECISION), l, threading ); + g5D_coarse_apply_schur_complement_PRECISION( out, &buf[0], &(l->oe_op_PRECISION), l, threading ); } else { - coarse_apply_schur_complement_PRECISION( out, buf1, &(l->oe_op_PRECISION), l, threading ); + coarse_apply_schur_complement_PRECISION( out, &buf[0], &(l->oe_op_PRECISION), l, threading ); } - coarse_diag_oo_PRECISION( out, buf1, &(l->oe_op_PRECISION), l, threading ); + coarse_diag_oo_PRECISION( out, &buf[0], &(l->oe_op_PRECISION), l, threading ); // back transformation part - coarse_diag_oo_inv_PRECISION( buf2, out, &(l->oe_op_PRECISION), l, threading ); + coarse_diag_oo_inv_PRECISION( &buf[1], out, &(l->oe_op_PRECISION), l, threading ); if ( g.method == 6 ) { START_LOCKED_MASTER(threading) coarse_gamma5_PRECISION( out, out, 
l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l->inner_vector_size, l ); - vector_PRECISION_define( buf1, 0, 0, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l ); - coarse_hopping_term_PRECISION( buf1, buf2, &(l->oe_op_PRECISION), _EVEN_SITES, l, no_threading ); - coarse_gamma5_PRECISION( buf1, buf1, 0, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l ); - vector_PRECISION_plus( out, out, buf1, 0, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l ); + vector_PRECISION_define( &buf[0], 0, 0, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l ); + coarse_hopping_term_PRECISION( &buf[0], &buf[1], &(l->oe_op_PRECISION), _EVEN_SITES, l, no_threading ); + coarse_gamma5_PRECISION( &buf[0], &buf[0], 0, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l ); + vector_PRECISION_plus( out, out, &buf[0], 0, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l ); END_LOCKED_MASTER(threading) } else { - coarse_hopping_term_PRECISION( out, buf2, &(l->oe_op_PRECISION), _EVEN_SITES, l, threading ); + coarse_hopping_term_PRECISION( out, &buf[1], &(l->oe_op_PRECISION), _EVEN_SITES, l, threading ); } - PUBLIC_FREE( buf1, complex_PRECISION, 2*l->vector_size ); + for(int i=0; i<2; i++) + vector_PRECISION_free( &buf[i], l, threading ); } } diff --git a/src/coarse_oddeven_generic.h b/src/coarse_oddeven_generic.h index e1481be..ec33b23 100644 --- a/src/coarse_oddeven_generic.h +++ b/src/coarse_oddeven_generic.h @@ -34,30 +34,24 @@ void coarse_solve_odd_even_PRECISION( gmres_PRECISION_struct *p, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); - void coarse_apply_schur_complement_PRECISION( vector_PRECISION out, vector_PRECISION in, + void coarse_apply_schur_complement_PRECISION( vector_PRECISION *out, vector_PRECISION *in, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); void g5D_coarse_solve_odd_even_PRECISION( gmres_PRECISION_struct *p, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); - void g5D_coarse_apply_schur_complement_PRECISION( vector_PRECISION out, vector_PRECISION in, + void g5D_coarse_apply_schur_complement_PRECISION( vector_PRECISION *out, vector_PRECISION *in, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); - void coarse_hopping_term_PRECISION( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, + void coarse_hopping_term_PRECISION( vector_PRECISION *out, vector_PRECISION *in, operator_PRECISION_struct *op, const int amount, level_struct *l, struct Thread *threading ); - void coarse_n_hopping_term_PRECISION( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, - const int amount, level_struct *l, struct Thread *threading ); - void coarse_hopping_term_PRECISION_vectorized( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, - const int amount, level_struct *l, struct Thread *threading ); - void coarse_pn_hopping_term_PRECISION_vectorized( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, - const int amount, level_struct *l, int sign, struct Thread *threading ); - void coarse_n_hopping_term_PRECISION_vectorized( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, + void coarse_n_hopping_term_PRECISION( vector_PRECISION *out, vector_PRECISION *in, operator_PRECISION_struct *op, const int amount, level_struct *l, struct Thread *threading ); - void coarse_odd_even_PRECISION_test( vector_PRECISION c4, 
vector_PRECISION c1, + void coarse_odd_even_PRECISION_test( vector_PRECISION *c4, vector_PRECISION *c1, level_struct *l, struct Thread *threading ); - void coarse_diag_ee_PRECISION( vector_PRECISION y, vector_PRECISION x, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); + void coarse_diag_ee_PRECISION( vector_PRECISION *y, vector_PRECISION *x, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); - void coarse_diag_oo_PRECISION( vector_PRECISION y, vector_PRECISION x, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); + void coarse_diag_oo_PRECISION( vector_PRECISION *y, vector_PRECISION *x, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); #endif diff --git a/src/coarse_operator_generic.c b/src/coarse_operator_generic.c index 33338d3..641d9af 100644 --- a/src/coarse_operator_generic.c +++ b/src/coarse_operator_generic.c @@ -37,32 +37,6 @@ void coarse_operator_PRECISION_free( level_struct *l ) { operator_PRECISION_free( &(l->next_level->op_PRECISION), _ORDINARY, l->next_level ); - coarse_operator_PRECISION_free_vectorized( &(l->next_level->s_PRECISION.op), l->next_level ); -} - -void coarse_operator_PRECISION_free_vectorized( operator_PRECISION_struct *op, level_struct *l ) { - -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - if( op->D_vectorized != NULL ) { - int n2 = (l->depth>0 && l->level>0) ? (2*l->num_lattice_sites-l->num_inner_lattice_sites):l->num_inner_lattice_sites; - int column_offset = 2*SIMD_LENGTH_PRECISION*((l->num_parent_eig_vect+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - // 2 is for complex, 4 is for 4 directions - FREE_HUGEPAGES( op->D_vectorized, OPERATOR_TYPE_PRECISION, 2*4*2*l->num_parent_eig_vect*column_offset*n2 ); - FREE_HUGEPAGES( op->D_transformed_vectorized, OPERATOR_TYPE_PRECISION, 2*4*2*l->num_parent_eig_vect*column_offset*n2 ); - } -#endif - -#ifdef OPTIMIZED_SELF_COUPLING_PRECISION - if( op->clover_vectorized != NULL ) { - int n = l->num_inner_lattice_sites; - int column_offset = SIMD_LENGTH_PRECISION*((2*l->num_parent_eig_vect+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - FREE_HUGEPAGES( op->clover_vectorized, OPERATOR_TYPE_PRECISION, 2*2*l->num_parent_eig_vect*column_offset*n ); -#ifdef HAVE_TM1p1 - int column_doublet_offset = SIMD_LENGTH_PRECISION*((4*l->num_parent_eig_vect+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - FREE_HUGEPAGES( op->clover_doublet_vectorized, OPERATOR_TYPE_PRECISION, 2*4*l->num_parent_eig_vect*column_doublet_offset*n ); -#endif - } -#endif } void coarse_operator_PRECISION_setup( vector_PRECISION *V, level_struct *l ) { @@ -70,7 +44,8 @@ void coarse_operator_PRECISION_setup( vector_PRECISION *V, level_struct *l ) { double t0, t1; t0 = MPI_Wtime(); - vector_PRECISION buffer1 = l->vbuf_PRECISION[4], buffer2 = l->vbuf_PRECISION[5]; + vector_PRECISION buffer1, buffer2; + buffer1.vector_buffer = l->vbuf_PRECISION[4].vector_buffer; buffer2.vector_buffer = l->vbuf_PRECISION[5].vector_buffer; int mu, n = l->num_eig_vect, i, j, D_size = l->next_level->D_size, @@ -93,22 +68,22 @@ void coarse_operator_PRECISION_setup( vector_PRECISION *V, level_struct *l ) { for ( i=0; is_PRECISION.op.c), l ); + negative_sendrecv_PRECISION( &V[i], mu, &(l->s_PRECISION.op.c), l ); } // apply self coupling of block-and-2spin-restricted dirac operator for each aggregate - aggregate_self_coupling( buffer1, buffer2, V[i], &(l->s_PRECISION), l ); + aggregate_self_coupling( &buffer1, &buffer2, &V[i], &(l->s_PRECISION), l ); // calculate selfcoupling 
entries of the coarse grid operator - set_coarse_self_coupling_PRECISION( buffer1, buffer2, V, i, l ); + set_coarse_self_coupling_PRECISION( &buffer1, &buffer2, V, i, l ); //odd_proj - aggregate_block( buffer1, buffer2, V[i], l->s_PRECISION.op.odd_proj, l ); - set_block_diagonal_PRECISION( buffer1, buffer2, V, i, l->next_level->op_PRECISION.odd_proj, l ); + aggregate_block( &buffer1, &buffer2, &V[i], l->s_PRECISION.op.odd_proj, l ); + set_block_diagonal_PRECISION( &buffer1, &buffer2, V, i, l->next_level->op_PRECISION.odd_proj, l ); for ( mu=0; mu<4; mu++ ) { // finish updating ghostcells of V[i] negative_wait_PRECISION( mu, &(l->s_PRECISION.op.c), l ); // apply 2spin-restricted dirac operator for direction mu for all aggregates - aggregate_neighbor_coupling( buffer1, buffer2, V[i], mu, &(l->s_PRECISION), l ); - set_coarse_neighbor_coupling_PRECISION( buffer1, buffer2, V, mu, i, l ); + aggregate_neighbor_coupling( &buffer1, &buffer2, &V[i], mu, &(l->s_PRECISION), l ); + set_coarse_neighbor_coupling_PRECISION( &buffer1, &buffer2, V, mu, i, l ); } } @@ -129,7 +104,7 @@ void coarse_operator_PRECISION_setup_finalize( level_struct *l, struct Thread *t PRECISION mf = (g.mu_factor[l->depth]) ? g.mu_factor[l->next_level->depth]/g.mu_factor[l->depth]:0; if ( mf*l->s_PRECISION.op.mu + mf*l->s_PRECISION.op.mu_even_shift == 0 && mf*l->s_PRECISION.op.mu + mf*l->s_PRECISION.op.mu_odd_shift == 0 ) - vector_PRECISION_define( l->next_level->op_PRECISION.tm_term, _COMPLEX_double_ZERO, 0, block_size, l->next_level ); + buffer_PRECISION_define( l->next_level->op_PRECISION.tm_term, _COMPLEX_double_ZERO, 0, block_size, l->next_level ); else tm_term_PRECISION_setup( mf*l->s_PRECISION.op.mu, mf*l->s_PRECISION.op.mu_even_shift, mf*l->s_PRECISION.op.mu_odd_shift, &(l->next_level->op_PRECISION), @@ -140,7 +115,7 @@ void coarse_operator_PRECISION_setup_finalize( level_struct *l, struct Thread *t PRECISION ef = (g.epsbar_factor[l->depth]) ? 
g.epsbar_factor[l->next_level->depth]/g.epsbar_factor[l->depth]:0; if ( ef*l->s_PRECISION.op.epsbar == 0 && ef*l->s_PRECISION.op.epsbar_ig5_even_shift == 0 && ef*l->s_PRECISION.op.epsbar_ig5_odd_shift == 0 ) - vector_PRECISION_define( l->next_level->op_PRECISION.epsbar_term, _COMPLEX_double_ZERO, 0, block_size, l->next_level ); + buffer_PRECISION_define( l->next_level->op_PRECISION.epsbar_term, _COMPLEX_double_ZERO, 0, block_size, l->next_level ); else epsbar_term_PRECISION_setup( ef*l->s_PRECISION.op.epsbar, ef*l->s_PRECISION.op.epsbar_ig5_even_shift, ef*l->s_PRECISION.op.epsbar_ig5_odd_shift, &(l->next_level->op_PRECISION), @@ -149,7 +124,7 @@ void coarse_operator_PRECISION_setup_finalize( level_struct *l, struct Thread *t } -void set_block_diagonal_PRECISION( vector_PRECISION spin_0_1, vector_PRECISION spin_2_3, +void set_block_diagonal_PRECISION( vector_PRECISION *spin_0_1, vector_PRECISION *spin_2_3, vector_PRECISION *V, const int n, config_PRECISION block, level_struct *l ) { // U(x) = [ A 0 , A=A*, D=D* @@ -162,16 +137,16 @@ void set_block_diagonal_PRECISION( vector_PRECISION spin_0_1, vector_PRECISION s aggregate_size = l->num_inner_lattice_sites*l->num_parent_eig_vect*2/num_aggregates, offset = l->num_parent_eig_vect, block_site_size = (num_eig_vect*(num_eig_vect+1)); - vector_PRECISION spin_0_1_pt, spin_2_3_pt, interpolation_data; + buffer_PRECISION spin_0_1_pt, spin_2_3_pt, interpolation_data; config_PRECISION block_pt; for ( k=0; k<=n; k++ ) { k1 = (n*(n+1))/2+k; k2 = (n*(n+1))/2+k+block_site_size/2; for ( j=0; jvector_buffer + j*aggregate_size; + spin_2_3_pt = spin_2_3->vector_buffer + j*aggregate_size; + interpolation_data = V[k].vector_buffer + j*aggregate_size; block_pt = block + j*block_site_size; for ( i=0; iis_PRECISION.num_agg, @@ -194,7 +169,7 @@ void set_coarse_self_coupling_PRECISION( vector_PRECISION spin_0_1, vector_PRECI aggregate_size = l->num_inner_lattice_sites*l->num_parent_eig_vect*2/num_aggregates, offset = l->num_parent_eig_vect, clover_site_size = (num_eig_vect*(2*num_eig_vect+1)); - vector_PRECISION spin_0_1_pt, spin_2_3_pt, interpolation_data; + buffer_PRECISION spin_0_1_pt, spin_2_3_pt, interpolation_data; config_PRECISION clover_pt, clover = l->next_level->op_PRECISION.clover; // U(x) = [ A B , A=A*, D=D*, C = -B* @@ -205,9 +180,9 @@ void set_coarse_self_coupling_PRECISION( vector_PRECISION spin_0_1, vector_PRECI k1 = (n*(n+1))/2+k; k2 = (n*(n+1))/2+k+(num_eig_vect*(num_eig_vect+1))/2; for ( j=0; jvector_buffer + j*aggregate_size; + spin_2_3_pt = spin_2_3->vector_buffer + j*aggregate_size; + interpolation_data = V[k].vector_buffer + j*aggregate_size; clover_pt = clover + j*clover_site_size; for ( i=0; ivector_buffer + j*aggregate_size; + spin_2_3_pt = spin_2_3->vector_buffer + j*aggregate_size; + interpolation_data = V[k].vector_buffer + j*aggregate_size; clover_pt = clover + j*clover_site_size; for ( i=0; iis_PRECISION.num_agg, @@ -250,7 +225,7 @@ void set_coarse_neighbor_coupling_PRECISION( vector_PRECISION spin_0_1, vector_P D_link_size = num_eig_vect*num_eig_vect*4, *index_dir = l->is_PRECISION.agg_boundary_index[mu], aggregate_boundary_sites = l->is_PRECISION.agg_boundary_length[mu]/num_aggregates; - vector_PRECISION spin_0_1_pt, spin_2_3_pt, interpolation_data; + buffer_PRECISION spin_0_1_pt, spin_2_3_pt, interpolation_data; config_PRECISION D_pt, D = l->next_level->op_PRECISION.D; // U_mu(x) = [ A B , U_-mu(x+muhat) = [ A* -C* @@ -264,8 +239,8 @@ void set_coarse_neighbor_coupling_PRECISION( vector_PRECISION spin_0_1, vector_P D_pt = 
D+(j*4+mu)*D_link_size; for ( i=0; ivector_buffer + nlsv*index_dir[i1]; + interpolation_data = V[k].vector_buffer + nlsv*index_dir[i1]; i1++; // A for ( m=0; mvector_buffer + nlsv*index_dir[i1]; + interpolation_data = V[k].vector_buffer + nlsv*index_dir[i1]; i1++; // B for ( m=0; mnum_block_sites, *length = s->dir_length, **index = s->index, *ind, *neighbor = s->op.neighbor_table, m = l->num_lattice_site_var, num_eig_vect = l->num_parent_eig_vect; - vector_PRECISION lphi = phi+start, leta = eta+start; - + vector_PRECISION lphi, leta; + lphi.vector_buffer = phi->vector_buffer+start; leta.vector_buffer = eta->vector_buffer+start; + vector_PRECISION leta1=leta, leta2=leta, lphi1=lphi, lphi2=lphi; + // site-wise self coupling -#ifndef OPTIMIZED_COARSE_SELF_COUPLING_PRECISION coarse_self_couplings_PRECISION( eta, phi, &(s->op), (start/m), (start/m)+n, l); -#else - coarse_self_couplings_PRECISION_vectorized( eta, phi, &(s->op), (start/m), (start/m)+n, l ); -#endif // inner block couplings -#ifndef OPTIMIZED_COARSE_NEIGHBOR_COUPLING_PRECISION int hopp_size = 4 * SQUARE( num_eig_vect*2 ); config_PRECISION D_pt, D = s->op.D + (start/m)*hopp_size; @@ -320,34 +292,20 @@ void coarse_block_operator_PRECISION( vector_PRECISION eta, vector_PRECISION phi for ( int i=0; iop.D_vectorized + - (start/m)*4*vectorized_link_offset + mu*vectorized_link_offset; - OPERATOR_TYPE_PRECISION *Dminus = s->op.D_transformed_vectorized + - (start/m)*4*vectorized_link_offset + mu*vectorized_link_offset; - ind = index[mu]; // mu direction - for ( int i=0; iop.neighbor_table, @@ -363,16 +321,16 @@ void coarse_aggregate_self_couplings_PRECISION( vector_PRECISION eta1, vector_PR length = l->is_PRECISION.agg_length[mu]; index_dir = l->is_PRECISION.agg_index[mu]; for ( i=0; ivector_buffer + n*index2; eta1_pt.vector_buffer = eta1->vector_buffer + n*index1; eta2_pt.vector_buffer = eta2->vector_buffer + n*index1; + coarse_spinwise_n_hopp_PRECISION( &eta1_pt, &eta2_pt, &phi_pt, D_pt, l ); + phi_pt.vector_buffer = phi->vector_buffer + n*index1; eta1_pt.vector_buffer = eta1->vector_buffer + n*index2; eta2_pt.vector_buffer = eta2->vector_buffer + n*index2; + coarse_spinwise_n_daggered_hopp_PRECISION( &eta1_pt, &eta2_pt, &phi_pt, D_pt, l ); } } } -void coarse_aggregate_neighbor_couplings_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, vector_PRECISION phi, +void coarse_aggregate_neighbor_couplings_PRECISION( vector_PRECISION *eta1, vector_PRECISION *eta2, vector_PRECISION *phi, const int mu, schwarz_PRECISION_struct *s, level_struct *l ) { int i, index1, index2, length = l->is_PRECISION.agg_boundary_length[mu], @@ -390,12 +348,12 @@ void coarse_aggregate_neighbor_couplings_PRECISION( vector_PRECISION eta1, vecto index1 = index_dir[i]; index2 = neighbor[i]; D_pt = D + Dss*index1 + Dls*mu; - phi_pt = phi + n*index2; eta1_pt = eta1 + n*index1; eta2_pt = eta2 + n*index1; - coarse_spinwise_hopp_PRECISION( eta1_pt, eta2_pt, phi_pt, D_pt, l ); + phi_pt.vector_buffer = phi->vector_buffer + n*index2; eta1_pt.vector_buffer = eta1->vector_buffer + n*index1; eta2_pt.vector_buffer = eta2->vector_buffer + n*index1; + coarse_spinwise_hopp_PRECISION( &eta1_pt, &eta2_pt, &phi_pt, D_pt, l ); } } -void coarse_self_couplings_PRECISION( vector_PRECISION eta, vector_PRECISION phi, +void coarse_self_couplings_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, operator_PRECISION_struct *op, int start, int end, level_struct *l ) { int num_eig_vect = l->num_parent_eig_vect, @@ -419,39 +377,40 @@ void coarse_self_couplings_PRECISION( 
vector_PRECISION eta, vector_PRECISION phi } -void coarse_aggregate_block_diagonal_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, vector_PRECISION phi, +void coarse_aggregate_block_diagonal_PRECISION( vector_PRECISION *eta1, vector_PRECISION *eta2, vector_PRECISION *phi, config_PRECISION block, level_struct *l ) { int length = l->inner_vector_size, num_eig_vect = l->num_parent_eig_vect, block_step_size = (num_eig_vect * (num_eig_vect+1))/2; config_PRECISION block_pt = block; - vector_PRECISION phi_pt=phi, eta1_pt=eta1, eta2_pt=eta2, phi_end_pt=phi+length; + vector_PRECISION phi_pt=*phi, eta1_pt=*eta1, eta2_pt=*eta2, phi_end_pt; + phi_end_pt.vector_buffer=phi->vector_buffer+length; // U(x) = [ A 0 , A=A*, D=D* // 0 D ] // storage order: upper triangle of A, upper triangle of D, columnwise // diagonal coupling - while ( phi_pt < phi_end_pt ) { + while ( phi_pt.vector_buffer< phi_end_pt.vector_buffer ) { // A - mvp_PRECISION( eta1_pt, block_pt, phi_pt, num_eig_vect ); - vector_PRECISION_define( eta2_pt, _COMPLEX_PRECISION_ZERO, 0, num_eig_vect, l ); - block_pt += block_step_size; eta1_pt += num_eig_vect; eta2_pt += num_eig_vect; phi_pt += num_eig_vect; + mvp_PRECISION( eta1_pt.vector_buffer, block_pt, phi_pt.vector_buffer, num_eig_vect ); + vector_PRECISION_define( &eta2_pt, _COMPLEX_PRECISION_ZERO, 0, num_eig_vect, l ); + block_pt += block_step_size; eta1_pt.vector_buffer += num_eig_vect; eta2_pt.vector_buffer += num_eig_vect; phi_pt.vector_buffer += num_eig_vect; // D - vector_PRECISION_define( eta1_pt, _COMPLEX_PRECISION_ZERO, 0, num_eig_vect, l ); - mvp_PRECISION( eta2_pt, block_pt, phi_pt, num_eig_vect ); - block_pt += block_step_size; eta1_pt += num_eig_vect; eta2_pt += num_eig_vect; phi_pt += num_eig_vect; + vector_PRECISION_define( &eta1_pt, _COMPLEX_PRECISION_ZERO, 0, num_eig_vect, l ); + mvp_PRECISION( eta2_pt.vector_buffer, block_pt, phi_pt.vector_buffer, num_eig_vect ); + block_pt += block_step_size; eta1_pt.vector_buffer += num_eig_vect; eta2_pt.vector_buffer += num_eig_vect; phi_pt.vector_buffer += num_eig_vect; } } -void coarse_spinwise_self_couplings_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, vector_PRECISION phi, +void coarse_spinwise_self_couplings_PRECISION( vector_PRECISION *eta1, vector_PRECISION *eta2, vector_PRECISION *phi, config_PRECISION clover, int length, level_struct *l ) { int num_eig_vect = l->num_parent_eig_vect, clover_step_size1 = (num_eig_vect * (num_eig_vect+1))/2, clover_step_size2 = SQUARE(num_eig_vect); config_PRECISION clover_pt = clover; - vector_PRECISION phi_pt=phi, eta1_pt=eta1, eta2_pt=eta2+num_eig_vect, phi_end_pt=phi+length; + buffer_PRECISION phi_pt=phi->vector_buffer, eta1_pt=eta1->vector_buffer, eta2_pt=eta2->vector_buffer+num_eig_vect, phi_end_pt=phi->vector_buffer+length; // U(x) = [ A B , A=A*, D=D*, C = -B* // C D ] // storage order: upper triangle of A, upper triangle of D, B, columnwise @@ -483,181 +442,77 @@ void coarse_operator_PRECISION_set_couplings( operator_PRECISION_struct *op, lev void coarse_operator_PRECISION_set_neighbor_couplings( operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { -#ifdef OPTIMIZED_COARSE_NEIGHBOR_COUPLING_PRECISION - int nc_size = SQUARE(l->num_parent_eig_vect*2); - int n1, n2; - int column_offset = 2*SIMD_LENGTH_PRECISION*((l->num_parent_eig_vect+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - int offset_v = 4*l->num_parent_eig_vect*column_offset; - - if ( l->depth > 0 && l->level>0 ) { - n1 = l->num_lattice_sites; - n2 = 
2*l->num_lattice_sites-l->num_inner_lattice_sites; - } else { - n1 = l->num_inner_lattice_sites; - n2 = l->num_inner_lattice_sites; - } - int start, end; - compute_core_start_end_custom(0, n1, &start, &end, l, threading, 1); - int n_per_core = end-start; - START_LOCKED_MASTER(threading) - if( op->D_vectorized == NULL ) { - // 2 is for complex, 4 is for 4 directions - MALLOC_HUGEPAGES( op->D_vectorized, OPERATOR_TYPE_PRECISION, 4*offset_v*n2, 64 ); - MALLOC_HUGEPAGES( op->D_transformed_vectorized, OPERATOR_TYPE_PRECISION, 4*offset_v*n2, 64 ); - } - END_LOCKED_MASTER(threading) - - copy_coarse_operator_to_vectorized_layout_PRECISION( - op->D + 4*start*nc_size, - op->D_vectorized + 4*start*offset_v, - n_per_core, l->num_parent_eig_vect); - copy_coarse_operator_to_transformed_vectorized_layout_PRECISION( - op->D + 4*start*nc_size, - op->D_transformed_vectorized + 4*start*offset_v, - n_per_core, l->num_parent_eig_vect); - // vectorize negative boundary - if ( n2>n1 ) { - compute_core_start_end_custom(n1, n2, &start, &end, l, threading, 1); - n_per_core = end-start; - copy_coarse_operator_to_vectorized_layout_PRECISION( - op->D + 4*start*nc_size, - op->D_vectorized + 4*start*offset_v, - n_per_core, l->num_parent_eig_vect); - copy_coarse_operator_to_transformed_vectorized_layout_PRECISION( - op->D + 4*start*nc_size, - op->D_transformed_vectorized + 4*start*offset_v, - n_per_core, l->num_parent_eig_vect); - } - SYNC_CORES(threading) -#endif - } void coarse_operator_PRECISION_set_self_couplings( operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { -#ifdef OPTIMIZED_COARSE_SELF_COUPLING_PRECISION - int n = l->num_inner_lattice_sites, nv = l->num_parent_eig_vect; - int sc_size = (nv)*(nv*2+1); - int start, end; - compute_core_start_end_custom(0, n, &start, &end, l, threading, 1); - int n_per_core = end-start; - - int column_offset = SIMD_LENGTH_PRECISION*((2*nv+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - int offset_v = 2*2*nv*column_offset; - if( op->clover_vectorized == NULL ) { - START_LOCKED_MASTER(threading) - MALLOC_HUGEPAGES( op->clover_vectorized, OPERATOR_TYPE_PRECISION, offset_v*n, 64 ); - END_LOCKED_MASTER(threading) - } - copy_coarse_operator_clover_to_vectorized_layout_PRECISION( - op->clover + start*sc_size, - op->clover_vectorized + start*offset_v, - n_per_core, nv); -#ifdef HAVE_TM - int tm_size = (nv)*(nv+1); - if ( op->mu + op->mu_odd_shift != 0.0 || op->mu + op->mu_even_shift != 0.0 ) - add_tm_term_to_vectorized_layout_PRECISION( - op->tm_term + start*tm_size, - op->clover_vectorized + start*offset_v, - n_per_core, nv); -#endif - -#ifdef HAVE_TM1p1 - int column_doublet_offset = SIMD_LENGTH_PRECISION*((4*nv+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - int offset_doublet_v = 2*4*nv*column_doublet_offset; - int eps_size = (nv)*(nv+1); - if( op->clover_doublet_vectorized == NULL ) { - START_LOCKED_MASTER(threading) - MALLOC_HUGEPAGES( op->clover_doublet_vectorized, OPERATOR_TYPE_PRECISION, offset_doublet_v*n, 64 ); - END_LOCKED_MASTER(threading) - } - copy_coarse_operator_clover_to_doublet_vectorized_layout_PRECISION( - op->clover + start*sc_size, - op->clover_doublet_vectorized + start*offset_doublet_v, - n_per_core, nv); - if ( op->epsbar != 0 || op->epsbar_ig5_odd_shift != 0 || op->epsbar_ig5_odd_shift != 0 ) - add_epsbar_term_to_doublet_vectorized_layout_PRECISION( - op->epsbar_term + start*eps_size, - op->clover_doublet_vectorized + start*offset_doublet_v, - n_per_core, nv); -#ifdef HAVE_TM - if ( op->mu + op->mu_odd_shift != 0.0 || op->mu + 
op->mu_even_shift != 0.0 ) - add_tm_term_to_doublet_vectorized_layout_PRECISION( - op->tm_term + start*tm_size, - op->clover_doublet_vectorized + start*offset_doublet_v, - n_per_core, nv); -#endif -#endif - SYNC_CORES(threading) -#endif } -void coarse_gamma5_PRECISION( vector_PRECISION eta, vector_PRECISION phi, int start, int end, level_struct *l ) { +void coarse_gamma5_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, int start, int end, level_struct *l ) { int j, k=l->num_lattice_site_var/2; - vector_PRECISION eta_end; + buffer_PRECISION eta_end, eta_pt, phi_pt; + eta_end = eta->vector_buffer + end; + phi_pt = phi->vector_buffer + start; + eta_pt = eta->vector_buffer + start; - eta_end = eta+end; - phi += start; - eta += start; - - if ( eta != phi ) { - while ( eta < eta_end ) { + if ( eta_pt != phi_pt ) { + while ( eta_pt < eta_end ) { for ( j=0; jnum_lattice_site_var/4; - vector_PRECISION eta_end; + buffer_PRECISION eta_end, phi_pt, eta_pt; - eta_end = eta+end; - phi += start; - eta += start; + eta_end = eta->vector_buffer + end; + phi_pt = phi->vector_buffer + start; + eta_pt = eta->vector_buffer + start; - ASSERT( eta != phi ); - while ( eta < eta_end ) { - phi += k; + ASSERT( eta_pt != phi_pt ); + while ( eta_pt < eta_end ) { + phi_pt += k; for ( j=0; jnum_inner_lattice_sites, &start, &end, l, threading, 1); -#ifndef OPTIMIZED_COARSE_SELF_COUPLING_PRECISION coarse_self_couplings_PRECISION( eta, phi, op, start, end, l); -#else - coarse_self_couplings_PRECISION_vectorized( eta, phi, op, start, end, l ); -#endif PROF_PRECISION_STOP( _SC, 1, threading ); PROF_PRECISION_START( _NC, threading ); -#ifndef OPTIMIZED_COARSE_NEIGHBOR_COUPLING_PRECISION coarse_hopping_term_PRECISION( eta, phi, op, _FULL_SYSTEM, l, threading ); -#else - coarse_hopping_term_PRECISION_vectorized( eta, phi, op, _FULL_SYSTEM, l, threading ); -#endif PROF_PRECISION_STOP( _NC, 1, threading ); } -void g5D_apply_coarse_operator_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operator_PRECISION_struct *op, +void g5D_apply_coarse_operator_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { int start, end; compute_core_start_end_custom(0, l->inner_vector_size, &start, &end, l, threading, l->num_lattice_site_var ); @@ -704,11 +551,11 @@ void g5D_apply_coarse_operator_PRECISION( vector_PRECISION eta, vector_PRECISION } -void apply_coarse_operator_dagger_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operator_PRECISION_struct *op, +void apply_coarse_operator_dagger_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { - coarse_gamma5_PRECISION( l->vbuf_PRECISION[3], phi, threading->start_index[l->depth], threading->end_index[l->depth], l ); - apply_coarse_operator_PRECISION( eta, l->vbuf_PRECISION[3], op, l, threading ); + coarse_gamma5_PRECISION( &(l->vbuf_PRECISION[3]), phi, threading->start_index[l->depth], threading->end_index[l->depth], l ); + apply_coarse_operator_PRECISION( eta, &(l->vbuf_PRECISION[3]), op, l, threading ); coarse_gamma5_PRECISION( eta, eta, threading->start_index[l->depth], threading->end_index[l->depth], l ); } @@ -716,68 +563,61 @@ void apply_coarse_operator_dagger_PRECISION( vector_PRECISION eta, vector_PRECIS void coarse_operator_PRECISION_test_routine( level_struct *l, struct Thread *threading ) { if ( !l->idle ) { - int vs = l->vector_size, ivs = l->inner_vector_size, - cvs = l->next_level->vector_size, civs = 
l->next_level->inner_vector_size; + int ivs = l->inner_vector_size, civs = l->next_level->inner_vector_size; PRECISION diff = 0; - vector_PRECISION vp1=NULL, vp2, vp3, vp4, vc1=NULL, vc2, vc3; + vector_PRECISION vp[4], vc[3]; + + for(int i=0; i<4; i++){ + vector_PRECISION_init( &vp[i] ); + vector_PRECISION_alloc( &vp[i], _ORDINARY, 1, l, threading ); + } - PUBLIC_MALLOC( vp1, complex_PRECISION, 4*vs ); - PUBLIC_MALLOC( vc1, complex_PRECISION, 3*cvs ); + for(int i=0; i<3; i++){ + vector_PRECISION_init( &vc[i] ); + vector_PRECISION_alloc( &vc[i], _ORDINARY, 1, l->next_level, threading ); + } SYNC_MASTER_TO_ALL(threading) - vp2 = vp1 + vs; vp3 = vp2 + vs; vp4 = vp3 + vs; vc2 = vc1 + cvs; vc3 = vc2 + cvs; - START_LOCKED_MASTER(threading) #ifdef HAVE_TM1p1 if(g.n_flavours == 1) #endif { -#ifdef INTERPOLATION_OPERATOR_LAYOUT_OPTIMIZED_PRECISION - double norm = 0.0; - double dot = 0.0; - float *op = (float *)l->is_PRECISION.operator; - float *op2 = (float *)(l->is_PRECISION.operator+0*SIMD_LENGTH_PRECISION*l->vector_size)+1; - for ( int i=0; iinner_vector_size; i++ ) - norm += (op[2*i*SIMD_LENGTH_PRECISION+0] + I*op[2*i*SIMD_LENGTH_PRECISION+SIMD_LENGTH_PRECISION])*conj(op[2*i*SIMD_LENGTH_PRECISION+0] + I*op[2*i*SIMD_LENGTH_PRECISION+SIMD_LENGTH_PRECISION]); - for ( int i=0; iinner_vector_size; i++ ) - dot += (op[2*i*SIMD_LENGTH_PRECISION+0] + I*op[2*i*SIMD_LENGTH_PRECISION+SIMD_LENGTH_PRECISION])*conj(op2[2*i*SIMD_LENGTH_PRECISION+0] + I*op2[2*i*SIMD_LENGTH_PRECISION+SIMD_LENGTH_PRECISION]); - diff = dot/norm; -#else - diff = global_inner_product_PRECISION( l->is_PRECISION.interpolation[0], l->is_PRECISION.interpolation[1], 0, ivs, l, no_threading ) - / global_norm_PRECISION( l->is_PRECISION.interpolation[0], 0, ivs, l, no_threading ); -#endif + diff = global_inner_product_PRECISION( &(l->is_PRECISION.interpolation[0]), &(l->is_PRECISION.interpolation[1]), 0, ivs, l, no_threading ) + / global_norm_PRECISION( &(l->is_PRECISION.interpolation[0]), 0, ivs, l, no_threading ); + test0_PRECISION("depth: %d, correctness of block_gram_schmidt: %le\n", l->depth, cabs(diff) ); } if ( !l->next_level->idle ) - vector_PRECISION_define_random( vc1, 0, civs, l->next_level ); - vector_PRECISION_distribute( vc2, vc1, l->next_level ); - vector_PRECISION_gather( vc3, vc2, l->next_level ); + vector_PRECISION_define_random( &vc[0], 0, civs, l->next_level ); + vector_PRECISION_distribute( &vc[1], &vc[0], l->next_level ); + vector_PRECISION_gather( &vc[2], &vc[1], l->next_level ); if ( !l->next_level->idle ) { - vector_PRECISION_minus( vc2, vc1, vc3, 0, civs, l->next_level ); - diff = global_norm_PRECISION( vc2, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( vc1, 0, civs, l->next_level, no_threading ); + vector_PRECISION_minus( &vc[1], &vc[0], &vc[2], 0, civs, l->next_level ); + diff = global_norm_PRECISION( &vc[1], 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( &vc[0], 0, civs, l->next_level, no_threading ); } test0_PRECISION("depth: %d, correctness of gather( distribute( phi_c ) ) : %le\n", l->depth, diff ); if ( !l->next_level->idle ) - vector_PRECISION_define_random( vc1, 0, civs, l->next_level ); - interpolate3_PRECISION( vp1, vc1, l, no_threading ); - restrict_PRECISION( vc2, vp1, l, no_threading ); + vector_PRECISION_define_random( &vc[0], 0, civs, l->next_level ); + interpolate3_PRECISION( &vp[0], &vc[0], l, no_threading ); + restrict_PRECISION( &vc[1], &vp[0], l, no_threading ); if ( !l->next_level->idle ) { - vector_PRECISION_minus( vc3, vc1, vc2, 0, civs, l->next_level ); - 
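/* --- Editorial sketch (not part of the patch) ------------------------------
   The "( P* P - 1 ) phi_c" check in this test routine relies on the
   prolongator P having orthonormal columns (built by the block Gram-Schmidt
   whose correctness is tested just above), so restriction after interpolation
   must return the coarse vector unchanged.  The same check with a small dense
   P, real arithmetic and standard C only (illustrative, not library API): */
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#define NF 8  /* fine dof   */
#define NC 3  /* coarse dof */
int main( void ) {
  double P[NF][NC], v[NC], Pv[NF], PtPv[NC], num = 0.0, den = 0.0;
  int i, j, k;
  srand( 1234 );
  /* random columns, then Gram-Schmidt so the columns of P are orthonormal */
  for ( j=0; j<NC; j++ ) {
    for ( i=0; i<NF; i++ ) P[i][j] = (double)rand()/RAND_MAX - 0.5;
    for ( k=0; k<j; k++ ) {
      double dot = 0.0;
      for ( i=0; i<NF; i++ ) dot += P[i][k]*P[i][j];
      for ( i=0; i<NF; i++ ) P[i][j] -= dot*P[i][k];
    }
    double nrm = 0.0;
    for ( i=0; i<NF; i++ ) nrm += P[i][j]*P[i][j];
    nrm = sqrt( nrm );
    for ( i=0; i<NF; i++ ) P[i][j] /= nrm;
  }
  /* random coarse vector; interpolate (P), then restrict (P^T) */
  for ( j=0; j<NC; j++ ) v[j] = (double)rand()/RAND_MAX - 0.5;
  for ( i=0; i<NF; i++ ) { Pv[i] = 0.0; for ( j=0; j<NC; j++ ) Pv[i] += P[i][j]*v[j]; }
  for ( j=0; j<NC; j++ ) { PtPv[j] = 0.0; for ( i=0; i<NF; i++ ) PtPv[j] += P[i][j]*Pv[i]; }
  /* relative difference, the quantity the surrounding test0 lines print */
  for ( j=0; j<NC; j++ ) { num += (PtPv[j]-v[j])*(PtPv[j]-v[j]); den += v[j]*v[j]; }
  printf( "correctness of ( P* P - 1 ) phi_c: %le\n", sqrt(num/den) );
  return 0;
}
/* -------------------------------------------------------------------------- */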
diff = global_norm_PRECISION( vc3, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( vc1, 0, civs, l->next_level, no_threading ); + vector_PRECISION_minus( &vc[2], &vc[0], &vc[1], 0, civs, l->next_level ); + diff = global_norm_PRECISION( &vc[2], 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( &vc[0], 0, civs, l->next_level, no_threading ); test0_PRECISION("depth: %d, correctness of ( P* P - 1 ) phi_c: %le\n", l->depth, abs_PRECISION(diff) ); } END_LOCKED_MASTER(threading) if(threading->n_core>1) { - interpolate3_PRECISION( vp1, vc1, l, threading ); - restrict_PRECISION( vc2, vp1, l, threading ); + interpolate3_PRECISION( &vp[0], &vc[0], l, threading ); + restrict_PRECISION( &vc[1], &vp[0], l, threading ); START_LOCKED_MASTER(threading) if ( !l->next_level->idle ) { - vector_PRECISION_minus( vc3, vc1, vc2, 0, civs, l->next_level ); - diff = global_norm_PRECISION( vc3, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( vc1, 0, civs, l->next_level, no_threading ); + vector_PRECISION_minus( &vc[2], &vc[0], &vc[1], 0, civs, l->next_level ); + diff = global_norm_PRECISION( &vc[2], 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( &vc[0], 0, civs, l->next_level, no_threading ); test0_PRECISION("depth: %d, correctness of ( P* P - 1 ) phi_c with threading: %le\n", l->depth, diff ); } END_LOCKED_MASTER(threading) @@ -785,27 +625,27 @@ void coarse_operator_PRECISION_test_routine( level_struct *l, struct Thread *thr START_LOCKED_MASTER(threading) if (l->depth==0) - gamma5_PRECISION( vp2, vp1, l, no_threading ); + gamma5_PRECISION( &vp[1], &vp[0], l, no_threading ); else - coarse_gamma5_PRECISION( vp2, vp1, 0, ivs, l ); - restrict_PRECISION( vc2, vp2, l, no_threading ); - coarse_gamma5_PRECISION( vc3, vc2, 0, civs, l->next_level ); + coarse_gamma5_PRECISION( &vp[1], &vp[0], 0, ivs, l ); + restrict_PRECISION( &vc[1], &vp[1], l, no_threading ); + coarse_gamma5_PRECISION( &vc[2], &vc[1], 0, civs, l->next_level ); if ( !l->next_level->idle ) { - vector_PRECISION_minus( vc2, vc1, vc3, 0, civs, l->next_level ); - diff = global_norm_PRECISION( vc2, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( vc1, 0, civs, l->next_level, no_threading ); + vector_PRECISION_minus( &vc[1], &vc[0], &vc[2], 0, civs, l->next_level ); + diff = global_norm_PRECISION( &vc[1], 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( &vc[0], 0, civs, l->next_level, no_threading ); test0_PRECISION("depth: %d, correctness of ( g5_c P* g5 P - 1 ) phi_c: %le\n", l->depth, diff ); } #ifdef HAVE_TM1p1 if(g.n_flavours == 2) { if (l->depth==0) - tau1_gamma5_PRECISION( vp2, vp1, l, no_threading ); + tau1_gamma5_PRECISION( &vp[1], &vp[0], l, no_threading ); else - coarse_tau1_gamma5_PRECISION( vp2, vp1, 0, ivs, l ); - restrict_PRECISION( vc2, vp2, l, no_threading ); - coarse_tau1_gamma5_PRECISION( vc3, vc2, 0, civs, l->next_level ); + coarse_tau1_gamma5_PRECISION( &vp[1], &vp[0], 0, ivs, l ); + restrict_PRECISION( &vc[1], &vp[1], l, no_threading ); + coarse_tau1_gamma5_PRECISION( &vc[2], &vc[1], 0, civs, l->next_level ); if ( !l->next_level->idle ) { - vector_PRECISION_minus( vc2, vc1, vc3, 0, civs, l->next_level ); - diff = global_norm_PRECISION( vc2, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( vc1, 0, civs, l->next_level, no_threading ); + vector_PRECISION_minus( &vc[1], &vc[0], &vc[2], 0, civs, l->next_level ); + diff = global_norm_PRECISION( &vc[1], 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( &vc[0], 0, civs, 
l->next_level, no_threading ); test0_PRECISION("depth: %d, correctness of ( tau1 g5_c P* tau1 g5 P - 1 ) phi_c: %le\n", l->depth, diff ); } } @@ -813,32 +653,32 @@ void coarse_operator_PRECISION_test_routine( level_struct *l, struct Thread *thr END_LOCKED_MASTER(threading) START_LOCKED_MASTER(threading) - vector_PRECISION_define( vp2, 0, 0, ivs, l ); + vector_PRECISION_define( &vp[1], 0, 0, ivs, l ); if (l->depth==0) - add_diagonal_PRECISION( vp2, vp1, l->s_PRECISION.op.odd_proj, ivs ); + add_diagonal_PRECISION( &vp[1], &vp[0], l->s_PRECISION.op.odd_proj, ivs ); else - coarse_add_block_diagonal_PRECISION( vp2, vp1, l->s_PRECISION.op.odd_proj, ivs, l ); - restrict_PRECISION( vc2, vp2, l, no_threading ); + coarse_add_block_diagonal_PRECISION( &vp[1], &vp[0], l->s_PRECISION.op.odd_proj, ivs, l ); + restrict_PRECISION( &vc[1], &vp[1], l, no_threading ); - vector_PRECISION_scale( vc2, vc2, -1.0, 0, civs, l->next_level ); - coarse_add_block_diagonal_PRECISION( vc2, vc1, l->next_level->s_PRECISION.op.odd_proj, civs, l->next_level ); - diff = global_norm_PRECISION( vc2, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( vc1, 0, civs, l->next_level, no_threading ); + vector_PRECISION_scale( &vc[1], &vc[1], -1.0, 0, civs, l->next_level ); + coarse_add_block_diagonal_PRECISION( &vc[1], &vc[0], l->next_level->s_PRECISION.op.odd_proj, civs, l->next_level ); + diff = global_norm_PRECISION( &vc[1], 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( &vc[0], 0, civs, l->next_level, no_threading ); test0_PRECISION("depth: %d, correctness of ( P* 1odd P - 1odd_c ) phi_c: %le\n", l->depth, diff ); END_LOCKED_MASTER(threading) #ifdef HAVE_TM START_LOCKED_MASTER(threading) if (g.mu + g.mu_odd_shift != 0.0 || g.mu + g.mu_even_shift != 0.0 ) { - vector_PRECISION_define( vp2, 0, 0, ivs, l ); + vector_PRECISION_define( &vp[1], 0, 0, ivs, l ); if (l->depth==0) - add_diagonal_PRECISION( vp2, vp1, l->s_PRECISION.op.tm_term, ivs ); + add_diagonal_PRECISION( &vp[1], &vp[0], l->s_PRECISION.op.tm_term, ivs ); else - coarse_add_anti_block_diagonal_PRECISION( vp2, vp1, l->s_PRECISION.op.tm_term, ivs, l ); - restrict_PRECISION( vc2, vp2, l, no_threading ); + coarse_add_anti_block_diagonal_PRECISION( &vp[1], &vp[0], l->s_PRECISION.op.tm_term, ivs, l ); + restrict_PRECISION( &vc[1], &vp[1], l, no_threading ); - vector_PRECISION_scale( vc2, vc2, -g.mu_factor[l->next_level->depth]/g.mu_factor[l->depth], 0, civs, l->next_level ); - coarse_add_anti_block_diagonal_PRECISION( vc2, vc1, l->next_level->s_PRECISION.op.tm_term, civs, l->next_level ); - diff = global_norm_PRECISION( vc2, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( vc1, 0, civs, l->next_level, no_threading ); + vector_PRECISION_scale( &vc[1], &vc[1], -g.mu_factor[l->next_level->depth]/g.mu_factor[l->depth], 0, civs, l->next_level ); + coarse_add_anti_block_diagonal_PRECISION( &vc[1], &vc[0], l->next_level->s_PRECISION.op.tm_term, civs, l->next_level ); + diff = global_norm_PRECISION( &vc[1], 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( &vc[0], 0, civs, l->next_level, no_threading ); test0_PRECISION("depth: %d, correctness of ( P* tm P - tm_c ) phi_c: %le\n", l->depth, diff ); } END_LOCKED_MASTER(threading) @@ -848,16 +688,16 @@ void coarse_operator_PRECISION_test_routine( level_struct *l, struct Thread *thr START_LOCKED_MASTER(threading) if ( g.n_flavours == 2 && ( g.epsbar != 0 || g.epsbar_ig5_odd_shift != 0 || g.epsbar_ig5_odd_shift != 0 ) ) { - vector_PRECISION_define( vp2, 0, 0, ivs, l ); + 
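/* --- Editorial sketch (not part of the patch) ------------------------------
   Most of the consistency checks in this routine reduce to the same quantity:
   take the difference of two vectors and print its norm relative to a
   reference norm (vector_PRECISION_minus followed by two
   global_norm_PRECISION calls).  A minimal helper capturing that idiom for
   plain double arrays (illustrative only, not library API): */
#include <math.h>
static double rel_diff( const double *a, const double *b, int n ) {
  double num = 0.0, den = 0.0;
  for ( int i=0; i<n; i++ ) {
    num += (a[i]-b[i])*(a[i]-b[i]);   /* || a - b ||^2 */
    den += b[i]*b[i];                 /* || b ||^2     */
  }
  return sqrt( num/den );
}
/* usage sketch: diff = rel_diff( restricted_fine_result, coarse_result, civs ); */
/* -------------------------------------------------------------------------- */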
vector_PRECISION_define( &vp[1], 0, 0, ivs, l ); if (l->depth==0) - apply_doublet_coupling_PRECISION( vp2, vp1, l->s_PRECISION.op.epsbar_term, ivs ); + apply_doublet_coupling_PRECISION( &vp[1], &vp[0], l->s_PRECISION.op.epsbar_term, ivs ); else - coarse_add_doublet_coupling_PRECISION( vp2, vp1, l->s_PRECISION.op.epsbar_term, ivs, l ); - restrict_PRECISION( vc2, vp2, l, no_threading ); + coarse_add_doublet_coupling_PRECISION( &vp[1], &vp[0], l->s_PRECISION.op.epsbar_term, ivs, l ); + restrict_PRECISION( &vc[1], &vp[1], l, no_threading ); - vector_PRECISION_scale( vc2, vc2, -g.epsbar_factor[l->next_level->depth]/g.epsbar_factor[l->depth], 0, civs, l->next_level ); - coarse_add_doublet_coupling_PRECISION( vc2, vc1, l->next_level->s_PRECISION.op.epsbar_term, civs, l->next_level ); - diff = global_norm_PRECISION( vc2, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( vc1, 0, civs, l->next_level, no_threading ); + vector_PRECISION_scale( &vc[1], &vc[1], -g.epsbar_factor[l->next_level->depth]/g.epsbar_factor[l->depth], 0, civs, l->next_level ); + coarse_add_doublet_coupling_PRECISION( &vc[1], &vc[0], l->next_level->s_PRECISION.op.epsbar_term, civs, l->next_level ); + diff = global_norm_PRECISION( &vc[1], 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( &vc[0], 0, civs, l->next_level, no_threading ); test0_PRECISION("depth: %d, correctness of ( P* eps P - eps_c ) phi_c: %le\n", l->depth, diff ); } END_LOCKED_MASTER(threading) @@ -865,30 +705,30 @@ void coarse_operator_PRECISION_test_routine( level_struct *l, struct Thread *thr if ( l->level > 0 ) { START_LOCKED_MASTER(threading) - interpolate3_PRECISION( vp1, vc1, l, no_threading ); + interpolate3_PRECISION( &vp[0], &vc[0], l, no_threading ); - apply_operator_PRECISION( vp2, vp1, &(l->p_PRECISION), l, no_threading ); + apply_operator_PRECISION( &vp[1], &vp[0], &(l->p_PRECISION), l, no_threading ); #ifdef HAVE_TM if (g.mu + g.mu_odd_shift != 0.0 || g.mu + g.mu_even_shift != 0.0 ) if (g.mu_factor[l->depth] != g.mu_factor[l->next_level->depth]) { - vector_PRECISION_scale( vp3, vp1, (g.mu_factor[l->next_level->depth]/g.mu_factor[l->depth])-1., 0, ivs, l ); + vector_PRECISION_scale( &vp[2], &vp[0], (g.mu_factor[l->next_level->depth]/g.mu_factor[l->depth])-1., 0, ivs, l ); if(l->depth == 0) - add_diagonal_PRECISION( vp2, vp3, l->p_PRECISION.op->tm_term, ivs ); + add_diagonal_PRECISION( &vp[1], &vp[2], l->p_PRECISION.op->tm_term, ivs ); else - coarse_add_anti_block_diagonal_PRECISION( vp2, vp3, l->p_PRECISION.op->tm_term, ivs, l ); + coarse_add_anti_block_diagonal_PRECISION( &vp[1], &vp[2], l->p_PRECISION.op->tm_term, ivs, l ); } #endif - restrict_PRECISION( vc2, vp2, l, no_threading ); + restrict_PRECISION( &vc[1], &vp[1], l, no_threading ); if ( !l->next_level->idle ) { if ( l->level==1 && g.odd_even ) - coarse_odd_even_PRECISION_test( vc3, vc1, l->next_level, no_threading ); + coarse_odd_even_PRECISION_test( &vc[2], &vc[0], l->next_level, no_threading ); else - apply_operator_PRECISION( vc3, vc1, &(l->next_level->p_PRECISION), l->next_level, no_threading ); + apply_operator_PRECISION( &vc[2], &vc[0], &(l->next_level->p_PRECISION), l->next_level, no_threading ); - vector_PRECISION_minus( vc3, vc2, vc3, 0, civs, l->next_level ); - diff = global_norm_PRECISION( vc3, 0, civs, l->next_level, no_threading ) /global_norm_PRECISION( vc2, 0, civs, l->next_level, no_threading ); + vector_PRECISION_minus( &vc[2], &vc[1], &vc[2], 0, civs, l->next_level ); + diff = global_norm_PRECISION( &vc[2], 0, civs, l->next_level, no_threading ) 
/global_norm_PRECISION( &vc[1], 0, civs, l->next_level, no_threading ); if ( l->level==1 && g.odd_even ) { test0_PRECISION("depth: %d, correctness of odd even preconditioned ( P* D P - D_c ) phi_c: %le\n", l->depth, diff ); @@ -901,14 +741,14 @@ void coarse_operator_PRECISION_test_routine( level_struct *l, struct Thread *thr if(threading->n_core>1) { if ( !l->next_level->idle ) { if ( l->level==1 && g.odd_even ) - coarse_odd_even_PRECISION_test( vc3, vc1, l->next_level, threading ); + coarse_odd_even_PRECISION_test( &vc[2], &vc[0], l->next_level, threading ); else - apply_operator_PRECISION( vc3, vc1, &(l->next_level->p_PRECISION), l->next_level, threading ); + apply_operator_PRECISION( &vc[2], &vc[0], &(l->next_level->p_PRECISION), l->next_level, threading ); } START_LOCKED_MASTER(threading) if ( !l->next_level->idle ) { - vector_PRECISION_minus( vc3, vc2, vc3, 0, civs, l->next_level ); - diff = global_norm_PRECISION( vc3, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( vc2, 0, civs, l->next_level, no_threading ); + vector_PRECISION_minus( &vc[2], &vc[1], &vc[2], 0, civs, l->next_level ); + diff = global_norm_PRECISION( &vc[2], 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( &vc[1], 0, civs, l->next_level, no_threading ); if ( l->level==1 && g.odd_even ) { //TODO: this test doesn't work without SSE!! test0_PRECISION("depth: %d, correctness of odd even preconditioned ( P* D P - D_c ) phi_c with D_c threaded: %le\n", l->depth, diff ); } else { @@ -921,29 +761,33 @@ void coarse_operator_PRECISION_test_routine( level_struct *l, struct Thread *thr START_LOCKED_MASTER(threading) if ( l->level > 0 && l->depth > 0 && g.method == 3 && g.odd_even ) { - vector_PRECISION_define_random( vp1, 0, ivs, l ); - block_to_oddeven_PRECISION( vp4, vp1, l, no_threading ); - coarse_diag_ee_PRECISION( vp3, vp4, &(l->oe_op_PRECISION), l, no_threading ); - coarse_diag_oo_PRECISION( vp3, vp4, &(l->oe_op_PRECISION), l, no_threading ); - coarse_hopping_term_PRECISION( vp3, vp4, &(l->oe_op_PRECISION), _FULL_SYSTEM, l, no_threading ); - oddeven_to_block_PRECISION( vp4, vp3, l, no_threading ); - apply_operator_PRECISION( vp2, vp1, &(l->p_PRECISION), l, no_threading ); - vector_PRECISION_minus( vp4, vp4, vp2, 0, ivs, l ); - diff = global_norm_PRECISION( vp4, 0, ivs, l, no_threading ) / global_norm_PRECISION( vp2, 0, ivs, l, no_threading ); + vector_PRECISION_define_random( &vp[0], 0, ivs, l ); + block_to_oddeven_PRECISION( &vp[3], &vp[0], l, no_threading ); + coarse_diag_ee_PRECISION( &vp[2], &vp[3], &(l->oe_op_PRECISION), l, no_threading ); + coarse_diag_oo_PRECISION( &vp[2], &vp[3], &(l->oe_op_PRECISION), l, no_threading ); + coarse_hopping_term_PRECISION( &vp[2], &vp[3], &(l->oe_op_PRECISION), _FULL_SYSTEM, l, no_threading ); + oddeven_to_block_PRECISION( &vp[3], &vp[2], l, no_threading ); + apply_operator_PRECISION( &vp[1], &vp[0], &(l->p_PRECISION), l, no_threading ); + vector_PRECISION_minus( &vp[3], &vp[3], &vp[1], 0, ivs, l ); + diff = global_norm_PRECISION( &vp[3], 0, ivs, l, no_threading ) / global_norm_PRECISION( &vp[1], 0, ivs, l, no_threading ); test0_PRECISION("depth: %d, correctness of odd even layout (smoother): %le\n", l->depth, diff ); - block_to_oddeven_PRECISION( vp4, vp1, l, no_threading ); - coarse_odd_even_PRECISION_test( vp3, vp4, l, no_threading ); - oddeven_to_block_PRECISION( vp4, vp3, l, no_threading ); - apply_operator_PRECISION( vp2, vp1, &(l->p_PRECISION), l, no_threading ); - vector_PRECISION_minus( vp4, vp4, vp2, 0, ivs, l ); - diff = global_norm_PRECISION( 
vp4, 0, ivs, l, no_threading ) / global_norm_PRECISION( vp2, 0, ivs, l, no_threading ); + block_to_oddeven_PRECISION( &vp[3], &vp[0], l, no_threading ); + coarse_odd_even_PRECISION_test( &vp[2], &vp[3], l, no_threading ); + oddeven_to_block_PRECISION( &vp[3], &vp[2], l, no_threading ); + apply_operator_PRECISION( &vp[1], &vp[0], &(l->p_PRECISION), l, no_threading ); + vector_PRECISION_minus( &vp[3], &vp[3], &vp[1], 0, ivs, l ); + diff = global_norm_PRECISION( &vp[3], 0, ivs, l, no_threading ) / global_norm_PRECISION( &vp[1], 0, ivs, l, no_threading ); test0_PRECISION("depth: %d, correctness of odd even preconditioned operator (smoother): %le\n", l->depth, diff ); } + + for(int i=0; i<4; i++) + vector_PRECISION_free( &vp[i], l, threading ); - FREE( vp1, complex_PRECISION, 4*vs ); - FREE( vc1, complex_PRECISION, 3*cvs ); - END_LOCKED_MASTER(threading) + for(int i=0; i<3; i++) + vector_PRECISION_free( &vc[i], l->next_level, threading ); + + END_LOCKED_MASTER(threading) if ( g.method != 6 && l->next_level->level > 0 && !l->next_level->idle ) { schwarz_PRECISION_mvm_testfun( &(l->next_level->s_PRECISION), l->next_level, threading ); diff --git a/src/coarse_operator_generic.h b/src/coarse_operator_generic.h index 3af0655..a33c594 100644 --- a/src/coarse_operator_generic.h +++ b/src/coarse_operator_generic.h @@ -22,53 +22,50 @@ #ifndef COARSE_OPERATOR_PRECISION_HEADER #define COARSE_OPERATOR_PRECISION_HEADER - #include "blas_vectorized.h" - struct Thread; void coarse_operator_PRECISION_alloc( level_struct *l ); void coarse_operator_PRECISION_free( level_struct *l ); - void coarse_operator_PRECISION_free_vectorized( operator_PRECISION_struct *op, level_struct *l ); void coarse_operator_PRECISION_setup( vector_PRECISION *V, level_struct *l ); void coarse_operator_PRECISION_setup_finalize( level_struct *l, struct Thread *threading ); void coarse_operator_PRECISION_set_couplings( operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); void coarse_operator_PRECISION_set_self_couplings( operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); void coarse_operator_PRECISION_set_neighbor_couplings( operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); - void set_coarse_self_coupling_PRECISION( vector_PRECISION buffer1, vector_PRECISION buffer2, + void set_coarse_self_coupling_PRECISION( vector_PRECISION *buffer1, vector_PRECISION *buffer2, vector_PRECISION *V, const int n, level_struct *l ); - void set_coarse_neighbor_coupling_PRECISION( vector_PRECISION buffer1, vector_PRECISION buffer2, + void set_coarse_neighbor_coupling_PRECISION( vector_PRECISION *buffer1, vector_PRECISION *buffer2, vector_PRECISION *V, const int mu, const int n, level_struct *l ); - void coarse_self_couplings_PRECISION( vector_PRECISION eta, vector_PRECISION phi, + void coarse_self_couplings_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, operator_PRECISION_struct *op, int start, int end, level_struct *l ); - void coarse_spinwise_self_couplings_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, vector_PRECISION phi, + void coarse_spinwise_self_couplings_PRECISION( vector_PRECISION *eta1, vector_PRECISION *eta2, vector_PRECISION *phi, config_PRECISION clover, int length, level_struct *l ); - void coarse_gamma5_PRECISION( vector_PRECISION eta, vector_PRECISION phi, int start, int end, level_struct *l ); - void coarse_tau1_gamma5_PRECISION( vector_PRECISION eta, vector_PRECISION phi, int start, int end, level_struct *l ); - void 
apply_coarse_operator_PRECISION( vector_PRECISION eta, vector_PRECISION phi, + void coarse_gamma5_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, int start, int end, level_struct *l ); + void coarse_tau1_gamma5_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, int start, int end, level_struct *l ); + void apply_coarse_operator_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); - void g5D_apply_coarse_operator_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operator_PRECISION_struct *op, + void g5D_apply_coarse_operator_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); - void apply_coarse_operator_dagger_PRECISION( vector_PRECISION eta, vector_PRECISION phi, + void apply_coarse_operator_dagger_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); - void coarse_block_operator_PRECISION( vector_PRECISION eta, vector_PRECISION phi, int start, + void coarse_block_operator_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, int start, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ); - void coarse_aggregate_self_couplings_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, vector_PRECISION phi, + void coarse_aggregate_self_couplings_PRECISION( vector_PRECISION *eta1, vector_PRECISION *eta2, vector_PRECISION *phi, schwarz_PRECISION_struct *s, level_struct *l ); - void coarse_aggregate_neighbor_couplings_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, vector_PRECISION phi, const int mu, schwarz_PRECISION_struct *s, level_struct *l ); + void coarse_aggregate_neighbor_couplings_PRECISION( vector_PRECISION *eta1, vector_PRECISION *eta2, vector_PRECISION *phi, const int mu, schwarz_PRECISION_struct *s, level_struct *l ); - void set_block_diagonal_PRECISION( vector_PRECISION spin_0_1, vector_PRECISION spin_2_3, vector_PRECISION *V, const int n, config_PRECISION block, level_struct *l ); + void set_block_diagonal_PRECISION( vector_PRECISION *spin_0_1, vector_PRECISION *spin_2_3, vector_PRECISION *V, const int n, config_PRECISION block, level_struct *l ); - void coarse_aggregate_block_diagonal_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, vector_PRECISION phi, config_PRECISION block, level_struct *l ); + void coarse_aggregate_block_diagonal_PRECISION( vector_PRECISION *eta1, vector_PRECISION *eta2, vector_PRECISION *phi, config_PRECISION block, level_struct *l ); void coarse_operator_PRECISION_test_routine( level_struct *l, struct Thread *threading ); // eta += D*phi, D stored columnwise - static inline void mv_PRECISION( const vector_PRECISION eta, const complex_PRECISION *D, - const vector_PRECISION phi, const register int n ) { + static inline void mv_PRECISION( const buffer_PRECISION eta, const complex_PRECISION *D, + const buffer_PRECISION phi, const register int n ) { register int i, j, k=0; for ( i=0; inum_lattice_site_var, @@ -199,7 +196,7 @@ clover_step_size1 = (num_eig_vect * (num_eig_vect+1))/2, clover_step_size2 = SQUARE(num_eig_vect); config_PRECISION clover_pt = clover; - vector_PRECISION phi_pt=phi, eta_pt=eta, phi_end_pt=phi+length; + buffer_PRECISION phi_pt=phi->vector_buffer, eta_pt=eta->vector_buffer, phi_end_pt=phi->vector_buffer+length; // U(x) = [ A B , A=A*, D=D*, C = -B* // C D ] // storage order: upper triangle of A, upper triangle of D, B, columnwise @@ -257,13 
+254,13 @@ } } - static inline void coarse_add_block_diagonal_PRECISION( vector_PRECISION eta, vector_PRECISION phi, + static inline void coarse_add_block_diagonal_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, config_PRECISION block, int length, level_struct *l ) { int num_eig_vect = l->num_parent_eig_vect, block_step_size = (num_eig_vect * (num_eig_vect+1))/2; config_PRECISION block_pt = block; - vector_PRECISION phi_pt=phi, eta_pt=eta, phi_end_pt=phi+length; + buffer_PRECISION phi_pt=phi->vector_buffer, eta_pt=eta->vector_buffer, phi_end_pt=phi->vector_buffer+length; // U(x) = [ A 0 , A=A*, D=D* diag. excluded // 0 D ] // storage order: upper triangle of A, upper triangle of D, columnwise @@ -294,13 +291,13 @@ } } - static inline void coarse_add_anti_block_diagonal_PRECISION( vector_PRECISION eta, vector_PRECISION phi, + static inline void coarse_add_anti_block_diagonal_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, config_PRECISION block, int length, level_struct *l ) { int num_eig_vect = l->num_parent_eig_vect, block_step_size = (num_eig_vect * (num_eig_vect+1))/2; config_PRECISION block_pt = block; - vector_PRECISION phi_pt=phi, eta_pt=eta, phi_end_pt=phi+length; + buffer_PRECISION phi_pt=phi->vector_buffer, eta_pt=eta->vector_buffer, phi_end_pt=phi->vector_buffer+length; // U(x) = [ A 0 , A=-A*, D=-D* diag. excluded // 0 D ] // storage order: upper triangle of A, upper triangle of D, columnwise @@ -331,14 +328,14 @@ } } - static inline void coarse_add_doublet_coupling_PRECISION( vector_PRECISION eta, vector_PRECISION phi, + static inline void coarse_add_doublet_coupling_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, config_PRECISION block, int length, level_struct *l ) { #ifdef HAVE_TM1p1 int num_eig_vect = l->num_parent_eig_vect, block_step_size = (num_eig_vect * (num_eig_vect+1))/2; config_PRECISION block_pt = block; - vector_PRECISION phi_pt=phi, eta_pt=eta, phi_end_pt=phi+length; + buffer_PRECISION phi_pt=phi->vector_buffer, eta_pt=eta->vector_buffer, phi_end_pt=phi->vector_buffer+length; // U(x) = [ 0 A , A=-A*, D=-D* diag. 
excluded // D 0 ] // storage order: upper triangle of A, upper triangle of D, columnwise @@ -360,11 +357,12 @@ #endif } - static inline void coarse_hopp_PRECISION( vector_PRECISION eta, vector_PRECISION phi, + static inline void coarse_hopp_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, config_PRECISION D, level_struct *l ) { int num_eig_vect = l->num_parent_eig_vect, num_eig_vect2 = SQUARE(l->num_parent_eig_vect); + buffer_PRECISION phi_pt=phi->vector_buffer, eta_pt=eta->vector_buffer; // U_mu(x) = [ A B , U_-mu(x+muhat) = [ A* -C* // C D ] -B* D* ] // storage order: A, C, B, D @@ -373,62 +371,63 @@ #ifdef HAVE_TM1p1 if( g.n_flavours == 2 ) { // A - nmv_PRECISION( eta, D, phi, num_eig_vect ); - eta += num_eig_vect;//1 - phi += num_eig_vect;//1 - nmv_PRECISION( eta, D, phi, num_eig_vect ); + nmv_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); + eta_pt += num_eig_vect;//1 + phi_pt += num_eig_vect;//1 + nmv_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); // C - eta += num_eig_vect;//2 - phi -= num_eig_vect;//0 + eta_pt += num_eig_vect;//2 + phi_pt -= num_eig_vect;//0 D += num_eig_vect2; - nmv_PRECISION( eta, D, phi, num_eig_vect ); - eta += num_eig_vect;//3 - phi += num_eig_vect;//1 - nmv_PRECISION( eta, D, phi, num_eig_vect ); + nmv_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); + eta_pt += num_eig_vect;//3 + phi_pt += num_eig_vect;//1 + nmv_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); // B - eta -= 3*num_eig_vect;//0 - phi += num_eig_vect;//2 + eta_pt -= 3*num_eig_vect;//0 + phi_pt += num_eig_vect;//2 D += num_eig_vect2; - nmv_PRECISION( eta, D, phi, num_eig_vect ); - eta += num_eig_vect;//1 - phi += num_eig_vect;//3 - nmv_PRECISION( eta, D, phi, num_eig_vect ); + nmv_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); + eta_pt += num_eig_vect;//1 + phi_pt += num_eig_vect;//3 + nmv_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); // D - eta += num_eig_vect;//2 - phi -= num_eig_vect;//2 + eta_pt += num_eig_vect;//2 + phi_pt -= num_eig_vect;//2 D += num_eig_vect2; - nmv_PRECISION( eta, D, phi, num_eig_vect ); - eta += num_eig_vect;//3 - phi += num_eig_vect;//3 - nmv_PRECISION( eta, D, phi, num_eig_vect ); + nmv_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); + eta_pt += num_eig_vect;//3 + phi_pt += num_eig_vect;//3 + nmv_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); } else { #endif // A - nmv_PRECISION( eta, D, phi, num_eig_vect ); + nmv_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); // C - eta += num_eig_vect; + eta_pt += num_eig_vect; D += num_eig_vect2; - nmv_PRECISION( eta, D, phi, num_eig_vect ); + nmv_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); // B - phi += num_eig_vect; - eta -= num_eig_vect; + phi_pt += num_eig_vect; + eta_pt -= num_eig_vect; D += num_eig_vect2; - nmv_PRECISION( eta, D, phi, num_eig_vect ); + nmv_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); // D - eta += num_eig_vect; + eta_pt += num_eig_vect; D += num_eig_vect2; - nmv_PRECISION( eta, D, phi, num_eig_vect ); + nmv_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); #ifdef HAVE_TM1p1 } #endif } - static inline void coarse_daggered_hopp_PRECISION( vector_PRECISION eta, vector_PRECISION phi, + static inline void coarse_daggered_hopp_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, config_PRECISION D, level_struct *l ) { int num_eig_vect = l->num_parent_eig_vect, num_eig_vect2 = SQUARE(l->num_parent_eig_vect); + buffer_PRECISION phi_pt=phi->vector_buffer, eta_pt=eta->vector_buffer; // U_mu(x) = [ A B , U_-mu(x+muhat) = [ A* -C* // C D ] -B* D* ] // storage order: A, C, B, D @@ -437,61 +436,62 @@ #ifdef HAVE_TM1p1 if( 
g.n_flavours == 2 ) { // A* - nmvh_PRECISION( eta, D, phi, num_eig_vect ); - eta += num_eig_vect;//1 - phi += num_eig_vect;//1 - nmvh_PRECISION( eta, D, phi, num_eig_vect ); + nmvh_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); + eta_pt += num_eig_vect;//1 + phi_pt += num_eig_vect;//1 + nmvh_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); // -C* - eta -= num_eig_vect;//0 - phi += num_eig_vect;//2 + eta_pt -= num_eig_vect;//0 + phi_pt += num_eig_vect;//2 D += num_eig_vect2; - mvh_PRECISION( eta, D, phi, num_eig_vect ); - eta += num_eig_vect;//1 - phi += num_eig_vect;//3 - mvh_PRECISION( eta, D, phi, num_eig_vect ); + mvh_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); + eta_pt += num_eig_vect;//1 + phi_pt += num_eig_vect;//3 + mvh_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); // -B* - eta += num_eig_vect;//2 - phi -= 3*num_eig_vect;//0 + eta_pt += num_eig_vect;//2 + phi_pt -= 3*num_eig_vect;//0 D += num_eig_vect2; - mvh_PRECISION( eta, D, phi, num_eig_vect ); - eta += num_eig_vect;//3 - phi += num_eig_vect;//1 - mvh_PRECISION( eta, D, phi, num_eig_vect ); + mvh_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); + eta_pt += num_eig_vect;//3 + phi_pt += num_eig_vect;//1 + mvh_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); // D* - eta -= num_eig_vect;//2 - phi += num_eig_vect;//2 + eta_pt -= num_eig_vect;//2 + phi_pt += num_eig_vect;//2 D += num_eig_vect2; - nmvh_PRECISION( eta, D, phi, num_eig_vect ); - eta += num_eig_vect;//3 - phi += num_eig_vect;//3 - nmvh_PRECISION( eta, D, phi, num_eig_vect ); + nmvh_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); + eta_pt += num_eig_vect;//3 + phi_pt += num_eig_vect;//3 + nmvh_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); } else { #endif // A* - nmvh_PRECISION( eta, D, phi, num_eig_vect ); + nmvh_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); // -C* - phi += num_eig_vect; + phi_pt += num_eig_vect; D += num_eig_vect2; - mvh_PRECISION( eta, D, phi, num_eig_vect ); + mvh_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); // -B* - eta += num_eig_vect; - phi -= num_eig_vect; + eta_pt += num_eig_vect; + phi_pt -= num_eig_vect; D += num_eig_vect2; - mvh_PRECISION( eta, D, phi, num_eig_vect ); + mvh_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); // D* - phi += num_eig_vect; + phi_pt += num_eig_vect; D += num_eig_vect2; - nmvh_PRECISION( eta, D, phi, num_eig_vect ); + nmvh_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); #ifdef HAVE_TM1p1 } #endif } - static inline void coarse_n_hopp_PRECISION( vector_PRECISION eta, vector_PRECISION phi, + static inline void coarse_n_hopp_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, config_PRECISION D, level_struct *l ) { int num_eig_vect = l->num_parent_eig_vect, num_eig_vect2 = SQUARE(l->num_parent_eig_vect); + buffer_PRECISION phi_pt=phi->vector_buffer, eta_pt=eta->vector_buffer; // U_mu(x) = [ A B , U_-mu(x+muhat) = [ A* -C* // C D ] -B* D* ] // storage order: A, C, B, D @@ -500,61 +500,62 @@ #ifdef HAVE_TM1p1 if( g.n_flavours == 2 ) { // A - mv_PRECISION( eta, D, phi, num_eig_vect ); - eta += num_eig_vect;//1 - phi += num_eig_vect;//1 - mv_PRECISION( eta, D, phi, num_eig_vect ); + mv_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); + eta_pt += num_eig_vect;//1 + phi_pt += num_eig_vect;//1 + mv_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); // C - eta += num_eig_vect;//2 - phi -= num_eig_vect;//0 + eta_pt += num_eig_vect;//2 + phi_pt -= num_eig_vect;//0 D += num_eig_vect2; - mv_PRECISION( eta, D, phi, num_eig_vect ); - eta += num_eig_vect;//3 - phi += num_eig_vect;//1 - mv_PRECISION( eta, D, phi, num_eig_vect ); + mv_PRECISION( eta_pt, D, 
phi_pt, num_eig_vect ); + eta_pt += num_eig_vect;//3 + phi_pt += num_eig_vect;//1 + mv_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); // B - eta -= 3*num_eig_vect;//0 - phi += num_eig_vect;//2 + eta_pt -= 3*num_eig_vect;//0 + phi_pt += num_eig_vect;//2 D += num_eig_vect2; - mv_PRECISION( eta, D, phi, num_eig_vect ); - eta += num_eig_vect;//1 - phi += num_eig_vect;//3 - mv_PRECISION( eta, D, phi, num_eig_vect ); + mv_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); + eta_pt += num_eig_vect;//1 + phi_pt += num_eig_vect;//3 + mv_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); // D - eta += num_eig_vect;//2 - phi -= num_eig_vect;//2 + eta_pt += num_eig_vect;//2 + phi_pt -= num_eig_vect;//2 D += num_eig_vect2; - mv_PRECISION( eta, D, phi, num_eig_vect ); - eta += num_eig_vect;//3 - phi += num_eig_vect;//3 - mv_PRECISION( eta, D, phi, num_eig_vect ); + mv_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); + eta_pt += num_eig_vect;//3 + phi_pt += num_eig_vect;//3 + mv_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); } else { #endif // A - mv_PRECISION( eta, D, phi, num_eig_vect ); + mv_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); // C - eta += num_eig_vect; + eta_pt += num_eig_vect; D += num_eig_vect2; - mv_PRECISION( eta, D, phi, num_eig_vect ); + mv_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); // B - phi += num_eig_vect; - eta -= num_eig_vect; + phi_pt += num_eig_vect; + eta_pt -= num_eig_vect; D += num_eig_vect2; - mv_PRECISION( eta, D, phi, num_eig_vect ); + mv_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); // D - eta += num_eig_vect; + eta_pt += num_eig_vect; D += num_eig_vect2; - mv_PRECISION( eta, D, phi, num_eig_vect ); + mv_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); #ifdef HAVE_TM1p1 } #endif } - static inline void coarse_n_daggered_hopp_PRECISION( vector_PRECISION eta, vector_PRECISION phi, + static inline void coarse_n_daggered_hopp_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, config_PRECISION D, level_struct *l ) { int num_eig_vect = l->num_parent_eig_vect, num_eig_vect2 = SQUARE(l->num_parent_eig_vect); + buffer_PRECISION phi_pt=phi->vector_buffer, eta_pt=eta->vector_buffer; // U_mu(x) = [ A B , U_-mu(x+muhat) = [ A* -C* // C D ] -B* D* ] // storage order: A, C, B, D @@ -563,164 +564,168 @@ #ifdef HAVE_TM1p1 if( g.n_flavours == 2 ) { // A* - mvh_PRECISION( eta, D, phi, num_eig_vect ); - eta += num_eig_vect;//1 - phi += num_eig_vect;//1 - mvh_PRECISION( eta, D, phi, num_eig_vect ); + mvh_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); + eta_pt += num_eig_vect;//1 + phi_pt += num_eig_vect;//1 + mvh_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); // -C* - eta -= num_eig_vect;//0 - phi += num_eig_vect;//2 + eta_pt -= num_eig_vect;//0 + phi_pt += num_eig_vect;//2 D += num_eig_vect2; - nmvh_PRECISION( eta, D, phi, num_eig_vect ); - eta += num_eig_vect;//1 - phi += num_eig_vect;//3 - nmvh_PRECISION( eta, D, phi, num_eig_vect ); + nmvh_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); + eta_pt += num_eig_vect;//1 + phi_pt += num_eig_vect;//3 + nmvh_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); // -B* - eta += num_eig_vect;//2 - phi -= 3*num_eig_vect;//0 + eta_pt += num_eig_vect;//2 + phi_pt -= 3*num_eig_vect;//0 D += num_eig_vect2; - nmvh_PRECISION( eta, D, phi, num_eig_vect ); - eta += num_eig_vect;//3 - phi += num_eig_vect;//1 - nmvh_PRECISION( eta, D, phi, num_eig_vect ); + nmvh_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); + eta_pt += num_eig_vect;//3 + phi_pt += num_eig_vect;//1 + nmvh_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); // D* - eta -= num_eig_vect;//2 - phi += num_eig_vect;//2 + 
eta_pt -= num_eig_vect;//2 + phi_pt += num_eig_vect;//2 D += num_eig_vect2; - mvh_PRECISION( eta, D, phi, num_eig_vect ); - eta += num_eig_vect;//3 - phi += num_eig_vect;//3 - mvh_PRECISION( eta, D, phi, num_eig_vect ); + mvh_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); + eta_pt += num_eig_vect;//3 + phi_pt += num_eig_vect;//3 + mvh_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); } else { #endif // A* - mvh_PRECISION( eta, D, phi, num_eig_vect ); + mvh_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); // -C* - phi += num_eig_vect; + phi_pt += num_eig_vect; D += num_eig_vect2; - nmvh_PRECISION( eta, D, phi, num_eig_vect ); + nmvh_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); // -B* - eta += num_eig_vect; - phi -= num_eig_vect; + eta_pt += num_eig_vect; + phi_pt -= num_eig_vect; D += num_eig_vect2; - nmvh_PRECISION( eta, D, phi, num_eig_vect ); + nmvh_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); // D* - phi += num_eig_vect; + phi_pt += num_eig_vect; D += num_eig_vect2; - mvh_PRECISION( eta, D, phi, num_eig_vect ); + mvh_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); #ifdef HAVE_TM1p1 } #endif } - static inline void coarse_spinwise_hopp_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, - vector_PRECISION phi, config_PRECISION D, level_struct *l ) { + static inline void coarse_spinwise_hopp_PRECISION( vector_PRECISION *eta1, vector_PRECISION *eta2, + vector_PRECISION *phi, config_PRECISION D, level_struct *l ) { int num_eig_vect = l->num_parent_eig_vect, num_eig_vect2 = SQUARE(l->num_parent_eig_vect); + buffer_PRECISION phi_pt=phi->vector_buffer, eta1_pt=eta1->vector_buffer, eta2_pt=eta2->vector_buffer; // U_mu(x) = [ A B , U_-mu(x+muhat) = [ A* -C* // C D ] -B* D* ] // storage order: A, C, B, D // note: minus sign of D = self_coupling - hopping_term is added here // A - mv_PRECISION( eta1, D, phi, num_eig_vect ); + mv_PRECISION( eta1_pt, D, phi_pt, num_eig_vect ); // C - eta1 += num_eig_vect; + eta1_pt += num_eig_vect; D += num_eig_vect2; - mv_PRECISION( eta1, D, phi, num_eig_vect ); + mv_PRECISION( eta1_pt, D, phi_pt, num_eig_vect ); // B - phi += num_eig_vect; + phi_pt += num_eig_vect; D += num_eig_vect2; - mv_PRECISION( eta2, D, phi, num_eig_vect ); + mv_PRECISION( eta2_pt, D, phi_pt, num_eig_vect ); // D - eta2 += num_eig_vect; + eta2_pt += num_eig_vect; D += num_eig_vect2; - mv_PRECISION( eta2, D, phi, num_eig_vect ); + mv_PRECISION( eta2_pt, D, phi_pt, num_eig_vect ); } - static inline void coarse_spinwise_daggered_hopp_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, - vector_PRECISION phi, config_PRECISION D, level_struct *l ) { + static inline void coarse_spinwise_daggered_hopp_PRECISION( vector_PRECISION *eta1, vector_PRECISION *eta2, + vector_PRECISION *phi, config_PRECISION D, level_struct *l ) { int num_eig_vect = l->num_parent_eig_vect, - num_eig_vect2 = SQUARE(l->num_parent_eig_vect); + num_eig_vect2 = SQUARE(l->num_parent_eig_vect); + buffer_PRECISION phi_pt=phi->vector_buffer, eta1_pt=eta1->vector_buffer, eta2_pt=eta2->vector_buffer; // U_mu(x) = [ A B , U_-mu(x+muhat) = [ A* -C* // C D ] -B* D* ] // storage order: A, C, B, D // note: minus sign of D = self_coupling - hopping_term is added here // A* - mvh_PRECISION( eta1, D, phi, num_eig_vect ); + mvh_PRECISION( eta1_pt, D, phi_pt, num_eig_vect ); // -C* - phi += num_eig_vect; + phi_pt += num_eig_vect; D += num_eig_vect2; - nmvh_PRECISION( eta2, D, phi, num_eig_vect ); + nmvh_PRECISION( eta2_pt, D, phi_pt, num_eig_vect ); // -B* - eta1 += num_eig_vect; - phi -= num_eig_vect; + eta1_pt += num_eig_vect; + phi_pt -= 
num_eig_vect; D += num_eig_vect2; - nmvh_PRECISION( eta1, D, phi, num_eig_vect ); + nmvh_PRECISION( eta1_pt, D, phi_pt, num_eig_vect ); // D* - eta2 += num_eig_vect; - phi += num_eig_vect; + eta2_pt += num_eig_vect; + phi_pt += num_eig_vect; D += num_eig_vect2; - mvh_PRECISION( eta2, D, phi, num_eig_vect ); + mvh_PRECISION( eta2_pt, D, phi_pt, num_eig_vect ); } - static inline void coarse_spinwise_n_hopp_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, - vector_PRECISION phi, config_PRECISION D, level_struct *l ) { + static inline void coarse_spinwise_n_hopp_PRECISION( vector_PRECISION *eta1, vector_PRECISION *eta2, + vector_PRECISION *phi, config_PRECISION D, level_struct *l ) { int num_eig_vect = l->num_parent_eig_vect, num_eig_vect2 = SQUARE(l->num_parent_eig_vect); + buffer_PRECISION phi_pt=phi->vector_buffer, eta1_pt=eta1->vector_buffer, eta2_pt=eta2->vector_buffer; // U_mu(x) = [ A B , U_-mu(x+muhat) = [ A* -C* // C D ] -B* D* ] // storage order: A, C, B, D // note: minus sign of D = self_coupling - hopping_term is added here // A - nmv_PRECISION( eta1, D, phi, num_eig_vect ); + nmv_PRECISION( eta1_pt, D, phi_pt, num_eig_vect ); // C - eta1 += num_eig_vect; + eta1_pt += num_eig_vect; D += num_eig_vect2; - nmv_PRECISION( eta1, D, phi, num_eig_vect ); + nmv_PRECISION( eta1_pt, D, phi_pt, num_eig_vect ); // B - phi += num_eig_vect; + phi_pt += num_eig_vect; D += num_eig_vect2; - nmv_PRECISION( eta2, D, phi, num_eig_vect ); + nmv_PRECISION( eta2_pt, D, phi_pt, num_eig_vect ); // D - eta2 += num_eig_vect; + eta2_pt += num_eig_vect; D += num_eig_vect2; - nmv_PRECISION( eta2, D, phi, num_eig_vect ); + nmv_PRECISION( eta2_pt, D, phi_pt, num_eig_vect ); } - static inline void coarse_spinwise_n_daggered_hopp_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, - vector_PRECISION phi, config_PRECISION D, level_struct *l ) { + static inline void coarse_spinwise_n_daggered_hopp_PRECISION( vector_PRECISION *eta1, vector_PRECISION *eta2, + vector_PRECISION *phi, config_PRECISION D, level_struct *l ) { int num_eig_vect = l->num_parent_eig_vect, - num_eig_vect2 = SQUARE(l->num_parent_eig_vect); + num_eig_vect2 = SQUARE(l->num_parent_eig_vect); + buffer_PRECISION phi_pt=phi->vector_buffer, eta1_pt=eta1->vector_buffer, eta2_pt=eta2->vector_buffer; // U_mu(x) = [ A B , U_-mu(x+muhat) = [ A* -C* // C D ] -B* D* ] // storage order: A, C, B, D // note: minus sign of D = self_coupling - hopping_term is added here // A* - nmvh_PRECISION( eta1, D, phi, num_eig_vect ); + nmvh_PRECISION( eta1_pt, D, phi_pt, num_eig_vect ); // -C* - phi += num_eig_vect; + phi_pt += num_eig_vect; D += num_eig_vect2; - mvh_PRECISION( eta2, D, phi, num_eig_vect ); + mvh_PRECISION( eta2_pt, D, phi_pt, num_eig_vect ); // -B* - eta1 += num_eig_vect; - phi -= num_eig_vect; + eta1_pt += num_eig_vect; + phi_pt -= num_eig_vect; D += num_eig_vect2; - mvh_PRECISION( eta1, D, phi, num_eig_vect ); + mvh_PRECISION( eta1_pt, D, phi_pt, num_eig_vect ); // D* - eta2 += num_eig_vect; - phi += num_eig_vect; + eta2_pt += num_eig_vect; + phi_pt += num_eig_vect; D += num_eig_vect2; - nmvh_PRECISION( eta2, D, phi, num_eig_vect ); + nmvh_PRECISION( eta2_pt, D, phi_pt, num_eig_vect ); } #endif diff --git a/src/coarsening_generic.c b/src/coarsening_generic.c index ae7893b..a3c6313 100644 --- a/src/coarsening_generic.c +++ b/src/coarsening_generic.c @@ -30,7 +30,7 @@ void interpolation_PRECISION_struct_init( interpolation_PRECISION_struct *is ) { is->test_vector = NULL; is->interpolation = NULL; is->eigenvalues = NULL; - is->tmp = NULL; + 
vector_PRECISION_init(&(is->tmp)); is->bootstrap_vector = NULL; is->bootstrap_eigenvalues = NULL; } diff --git a/src/data_generic.c b/src/data_generic.c index 950c814..c666644 100644 --- a/src/data_generic.c +++ b/src/data_generic.c @@ -22,7 +22,7 @@ #include "main.h" // vector storage for PRECISION precision -void vector_PRECISION_define( vector_PRECISION phi, complex_PRECISION value, int start, int end, level_struct *l ) { +void buffer_PRECISION_define( complex_PRECISION *phi, complex_PRECISION value, int start, int end, level_struct *l ) { int thread = omp_get_thread_num(); if(thread == 0 && start != end) @@ -39,7 +39,7 @@ void vector_PRECISION_define( vector_PRECISION phi, complex_PRECISION value, int } -void vector_PRECISION_define_random( vector_PRECISION phi, int start, int end, level_struct *l ) { +void vector_PRECISION_define_random( vector_PRECISION *phi, int start, int end, level_struct *l ) { int thread = omp_get_thread_num(); if(thread == 0 && start != end) @@ -47,10 +47,29 @@ void vector_PRECISION_define_random( vector_PRECISION phi, int start, int end, l if ( phi != NULL ) { int i; for ( i=start; ivector_buffer[i] = (PRECISION)(((double)rand()/(double)RAND_MAX))-0.5 + ( (PRECISION)((double)rand()/(double)RAND_MAX)-0.5)*_Complex_I; } else { error0("Error in \"vector_PRECISION_define_random\": pointer is null\n"); } if(thread == 0 && start != end) PROF_PRECISION_STOP( _SET, 1 ); } + + +void vector_PRECISION_define_random_new( vector_PRECISION *phi, level_struct *l, struct Thread *threading ) { + + int start, end; + compute_core_start_end( 0, (phi->size)*(phi->num_vect), &start, &end, l, threading ); + int thread = omp_get_thread_num(); + if(thread == 0) + PROF_PRECISION_START( _SET ); + if ( phi != NULL ) { + int i; + for ( i=start; ivector_buffer[i] = (PRECISION)(((double)rand()/(double)RAND_MAX))-0.5 + ( (PRECISION)((double)rand()/(double)RAND_MAX)-0.5)*_Complex_I; + } else { + error0("Error in \"vector_PRECISION_define_random\": pointer is null\n"); + } + if(thread == 0) + PROF_PRECISION_STOP( _SET, 1 ); +} diff --git a/src/data_generic.h b/src/data_generic.h index b236ab4..76fd875 100644 --- a/src/data_generic.h +++ b/src/data_generic.h @@ -22,7 +22,7 @@ #ifndef DATA_PRECISION_HEADER #define DATA_PRECISION_HEADER - void vector_PRECISION_define( vector_PRECISION phi, complex_PRECISION value, int start, int end, level_struct *l ); - void vector_PRECISION_define_random( vector_PRECISION phi, int start, int end, level_struct *l ); - + void buffer_PRECISION_define( complex_PRECISION *phi, complex_PRECISION value, int start, int end, level_struct *l ); + void vector_PRECISION_define_random( vector_PRECISION *phi, int start, int end, level_struct *l ); + void vector_PRECISION_define_random_new( vector_PRECISION *phi, level_struct *l, struct Thread *threading ); #endif diff --git a/src/dirac.c b/src/dirac.c index 068e8a7..8d85319 100644 --- a/src/dirac.c +++ b/src/dirac.c @@ -44,14 +44,14 @@ void compute_clover_term ( SU3_storage U, level_struct *l ) { #ifdef HAVE_TM if ( g.mu + g.mu_even_shift == 0 && g.mu + g.mu_odd_shift == 0 ) - vector_double_define( op->tm_term, _COMPLEX_double_ZERO, 0, l->inner_vector_size, l ); + buffer_double_define( op->tm_term, _COMPLEX_double_ZERO, 0, l->inner_vector_size, l ); else tm_term_double_setup( g.mu, g.mu_even_shift, g.mu_odd_shift, op, l, no_threading ); #endif #ifdef HAVE_TM1p1 if ( g.epsbar == 0 && g.epsbar_ig5_even_shift == 0 && g.epsbar_ig5_odd_shift == 0 ) - vector_double_define( op->epsbar_term, _COMPLEX_double_ZERO, 0, 
l->inner_vector_size, l ); + buffer_double_define( op->epsbar_term, _COMPLEX_double_ZERO, 0, l->inner_vector_size, l ); else epsbar_term_double_setup( g.epsbar, g.epsbar_ig5_even_shift, g.epsbar_ig5_odd_shift, op, l, no_threading ); #endif @@ -86,7 +86,7 @@ void compute_clover_term ( SU3_storage U, level_struct *l ) { mat_free( &Qstore, 3 ); spin_free( 4, 4 ); } else { - vector_double_define( op->clover, 4+op->m0, 0, l->inner_vector_size, l ); + buffer_double_define( op->clover, 4+op->m0, 0, l->inner_vector_size, l ); } } @@ -436,7 +436,7 @@ void SU3_ghost_update( SU3_storage *U, level_struct *l ) { int t, z, y, x, mu, nu, *ll = l->local_lattice, ls[4], le[4]; long int i, j, send_size, max_size; - vector_double buffer1 = NULL, buffer2 = NULL, buffer3 = NULL, buffer4 = NULL; + buffer_double buffer1 = NULL, buffer2 = NULL, buffer3 = NULL, buffer4 = NULL; max_size = 0; for ( mu=0; mu<4; mu++ ) { @@ -528,11 +528,11 @@ void SU3_ghost_update( SU3_storage *U, level_struct *l ) { send_size = i; ASSERT(send_size<=max_size); } - - FREE( buffer1, complex_double, max_size ); - FREE( buffer2, complex_double, max_size ); - FREE( buffer3, complex_double, max_size ); + FREE( buffer1, complex_double, max_size ); + FREE( buffer2, complex_double, max_size ); + FREE( buffer3, complex_double, max_size ); FREE( buffer4, complex_double, max_size ); + } diff --git a/src/dirac_generic.c b/src/dirac_generic.c index 6be33de..ab420ff 100644 --- a/src/dirac_generic.c +++ b/src/dirac_generic.c @@ -21,12 +21,12 @@ #include "main.h" -void clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operator_PRECISION_struct *op, int start, int end, +void clover_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, operator_PRECISION_struct *op, int start, int end, level_struct *l, struct Thread *threading ) { int nv = l->num_lattice_site_var; - vector_PRECISION lphi = phi+start, leta = eta+start; - vector_PRECISION leta_end = eta+end; + buffer_PRECISION lphi = phi->vector_buffer+start, leta = eta->vector_buffer+start; + buffer_PRECISION leta_end = eta->vector_buffer+end; #ifdef PROFILING START_MASTER(threading) @@ -79,9 +79,6 @@ void clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operator_PREC #endif } else { - -#ifndef OPTIMIZED_SELF_COUPLING_PRECISION - config_PRECISION clover = op->clover+(start/nv)*42; #ifdef HAVE_TM1p1 if( g.n_flavours == 2 ) { @@ -123,28 +120,153 @@ void clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operator_PREC #ifdef HAVE_TM1p1 } #endif - -#else + } #ifdef HAVE_TM1p1 - PRECISION *clover = ( g.n_flavours == 2 ) ? 
op->clover_doublet_vectorized : op->clover_vectorized; -#else - PRECISION *clover = op->clover_vectorized; -#endif - clover += start*12; - while ( leta < leta_end ) { // tm_term included in the clover vectorized - sse_site_clover_PRECISION( (PRECISION*) leta, (PRECISION*) lphi, clover ); - leta += nv; lphi += nv; - clover += 12*nv; + config_PRECISION eps_term = op->epsbar_term+(start/nv)*12; + lphi = phi->vector_buffer+start, leta = eta->vector_buffer+start; + if ( g.n_flavours == 2 && + ( g.epsbar != 0 || g.epsbar_ig5_odd_shift != 0 || g.epsbar_ig5_odd_shift != 0 ) ) + while ( leta < leta_end ) { + lphi += 6; + FOR6( *leta += (*lphi)*(*eps_term); leta++; lphi++; eps_term++; ) + lphi -= 12; + eps_term -= 6; + FOR6( *leta += (*lphi)*(*eps_term); leta++; lphi++; eps_term++; ) + lphi += 6; } - +#endif + + +#ifdef PROFILING + START_MASTER(threading) + PROF_PRECISION_STOP( _SC, 1 ); + END_MASTER(threading) #endif +} + + +void clover_PRECISION_new( vector_PRECISION *eta, vector_PRECISION *phi, operator_PRECISION_struct *op, int start, int end, + level_struct *l, struct Thread *threading ) { + + int nv = l->num_lattice_site_var, n_vect=g.num_rhs_vect, i, j, jj; + buffer_PRECISION lphi = phi->vector_buffer+start*n_vect, leta = eta->vector_buffer+start*n_vect; + buffer_PRECISION leta_end = eta->vector_buffer+end*n_vect; +#ifdef PROFILING + START_MASTER(threading) + PROF_PRECISION_START( _SC ); + END_MASTER(threading) +#endif + +#ifdef HAVE_TM + config_PRECISION tm_term = op->tm_term+(start/nv)*12; +#endif + + if ( g.csw == 0.0 ) { + + config_PRECISION clover = op->clover+(start/nv)*12; +/*#ifdef HAVE_TM1p1 + if( g.n_flavours == 2 ) { +#ifdef HAVE_TM + if ( g.mu + g.mu_odd_shift != 0.0 || g.mu + g.mu_even_shift != 0.0 ) + while ( leta < leta_end ) { + FOR6( *leta = (*lphi)*((*clover)+(*tm_term)); leta++; lphi++; clover++; tm_term++; ); + clover -= 6; + tm_term -= 6; + FOR6( *leta = (*lphi)*((*clover)-(*tm_term)); leta++; lphi++; clover++; tm_term++; ); + FOR6( *leta = (*lphi)*((*clover)+(*tm_term)); leta++; lphi++; clover++; tm_term++; ); + clover -= 6; + tm_term -= 6; + FOR6( *leta = (*lphi)*((*clover)-(*tm_term)); leta++; lphi++; clover++; tm_term++; ); + } + else +#endif + while ( leta < leta_end ) { + FOR6( *leta = (*lphi)*(*clover); leta++; lphi++; clover++; ); + clover -= 6; + FOR12( *leta = (*lphi)*(*clover); leta++; lphi++; clover++; ); + clover -= 6; + FOR6( *leta = (*lphi)*(*clover); leta++; lphi++; clover++; ); + } + } else { +#endif*/ +#ifdef HAVE_TM + if ( g.mu + g.mu_odd_shift != 0.0 || g.mu + g.mu_even_shift != 0.0 ) { + while ( leta < leta_end ) + for( i=0; i<12; i++ ) { + VECTOR_LOOP(j, n_vect, jj, *leta = (*lphi)*((*clover)+(*tm_term)); + leta++; + lphi++;) + clover++; + tm_term++; + } + }// else +#endif + while ( leta < leta_end ) + for( i=0; i<12; i++ ){ + VECTOR_LOOP(j, n_vect, jj, *leta = (*lphi)*(*clover); + leta++; + lphi++;) + clover++; } +/*#ifdef HAVE_TM1p1 + } +#endif*/ + + } else { + config_PRECISION clover = op->clover+(start/nv)*42; +/*#ifdef HAVE_TM1p1 + if( g.n_flavours == 2 ) { +#ifdef HAVE_TM + if ( g.mu + g.mu_odd_shift != 0.0 || g.mu + g.mu_even_shift != 0.0 ) + while ( leta < leta_end ) { + doublet_site_clover_PRECISION( leta, lphi, clover ); + clover+=42; + FOR6( *leta += (*lphi)*(*tm_term); leta++; lphi++; tm_term++; ); + tm_term -= 6; + FOR6( *leta -=(*lphi)*(*tm_term); leta++; lphi++; tm_term++; ); + FOR6( *leta += (*lphi)*(*tm_term); leta++; lphi++; tm_term++; ); + tm_term -= 6; + FOR6( *leta -= (*lphi)*(*tm_term); leta++; lphi++; tm_term++; ); + } + else 
+#endif + while ( leta < leta_end ) { + doublet_site_clover_PRECISION( leta, lphi, clover ); + leta+=24; lphi+=24; + clover+=42; + } + } else { +#endif*/ +#ifdef HAVE_TM + if ( g.mu + g.mu_odd_shift != 0.0 || g.mu + g.mu_even_shift != 0.0 ) + while ( leta < leta_end ) { + site_clover_PRECISION_new( leta, lphi, clover ); + for( i=0; i<12; i++ ){ + VECTOR_LOOP(j, n_vect, jj, *leta += (*lphi)*(*tm_term); + leta++; + lphi++;) + tm_term++; + } + clover+=42; + } + // else +#endif + while ( leta < leta_end ) { + site_clover_PRECISION_new( leta, lphi, clover ); + leta+=12*n_vect; lphi+=12*n_vect; + clover+=42; + } +/*#ifdef HAVE_TM1p1 + } +#endif */ + } +/* #ifdef HAVE_TM1p1 config_PRECISION eps_term = op->epsbar_term+(start/nv)*12; - lphi = phi+start, leta = eta+start; + lphi = phi->vector_buffer+start+phi_shift, leta = eta->vector_buffer+start+eta_shift; if ( g.n_flavours == 2 && ( g.epsbar != 0 || g.epsbar_ig5_odd_shift != 0 || g.epsbar_ig5_odd_shift != 0 ) ) while ( leta < leta_end ) { @@ -156,7 +278,7 @@ void clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operator_PREC lphi += 6; } #endif - +*/ #ifdef PROFILING START_MASTER(threading) @@ -166,59 +288,52 @@ void clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operator_PREC } -static void spin0and1_clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, config_PRECISION clover, level_struct *l ) { + +static void spin0and1_clover_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, config_PRECISION clover, level_struct *l ) { - vector_PRECISION eta_end = eta + l->inner_vector_size; + buffer_PRECISION eta_end = eta->vector_buffer + l->inner_vector_size, leta = eta->vector_buffer, lphi = phi->vector_buffer; if ( g.csw == 0.0 ) { - while ( eta < eta_end ) { - FOR6( *eta = (*phi)*(*clover); eta++; phi++; clover++; ) - FOR6( *eta = _COMPLEX_PRECISION_ZERO; eta++; ) - phi+=6; clover+=6; + while ( leta < eta_end ) { + FOR6( *leta = (*lphi)*(*clover); leta++; lphi++; clover++; ) + FOR6( *leta = _COMPLEX_PRECISION_ZERO; leta++; ) + lphi+=6; clover+=6; } } else { - while ( eta < eta_end ) { - spin0and1_site_clover_PRECISION( eta, phi, clover ); - eta+=12; phi+=12; clover+=42; + while ( leta < eta_end ) { + spin0and1_site_clover_PRECISION( leta, lphi, clover ); + leta+=12; lphi+=12; clover+=42; } } } -static void spin2and3_clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, config_PRECISION clover, level_struct *l ) { +static void spin2and3_clover_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, config_PRECISION clover, level_struct *l ) { - vector_PRECISION eta_end = eta + l->inner_vector_size; + buffer_PRECISION eta_end = eta->vector_buffer + l->inner_vector_size, leta = eta->vector_buffer, lphi = phi->vector_buffer; if ( g.csw == 0.0 ) { - while ( eta < eta_end ) { - phi+=6; clover+=6; - FOR6( *eta = _COMPLEX_PRECISION_ZERO; eta++; ) - FOR6( *eta = (*phi)*(*clover); eta++; phi++; clover++; ) + while ( leta < eta_end ) { + lphi+=6; clover+=6; + FOR6( *leta = _COMPLEX_PRECISION_ZERO; leta++; ) + FOR6( *leta = (*lphi)*(*clover); leta++; lphi++; clover++; ) } } else { - while ( eta < eta_end ) { - spin2and3_site_clover_PRECISION( eta, phi, clover ); - eta +=12; phi+=12; clover+=42; + while ( leta < eta_end ) { + spin2and3_site_clover_PRECISION( leta, lphi, clover ); + leta +=12; lphi+=12; clover+=42; } } } -void block_d_plus_clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, int start, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ) { +void 
block_d_plus_clover_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, int start, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ) { START_UNTHREADED_FUNCTION(threading) int n = s->num_block_sites, *length = s->dir_length, **index = s->index, *neighbor = s->op.neighbor_table, nv = l->num_lattice_site_var; - vector_PRECISION lphi = phi+start, leta = eta+start; + buffer_PRECISION lphi = phi->vector_buffer+start, leta = eta->vector_buffer+start; // clover term clover_PRECISION(eta, phi, &(s->op), start, start+nv*n, l, no_threading ); -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_float // block operator vectorized just in the float environment - PRECISION *Dplus = s->op.D_vectorized + (start/nv)*96; - PRECISION *Dminus = s->op.D_transformed_vectorized + (start/nv)*96; - for ( int mu=0; mu<4; mu++ ) { - block_oddeven_plus_coupling_PRECISION( (PRECISION*)leta, Dplus, (PRECISION*)lphi, mu, 0, length[mu], index[mu], neighbor ); - block_oddeven_minus_coupling_PRECISION( (PRECISION*)leta, Dminus, (PRECISION*)lphi, mu, 0, length[mu], index[mu], neighbor ); - } -#else int i, j, k, *ind; config_PRECISION D_pt; config_PRECISION D = s->op.D + (start/nv)*36; @@ -344,28 +459,22 @@ void block_d_plus_clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, } #ifdef HAVE_TM1p1 } -#endif #endif END_UNTHREADED_FUNCTION(threading) } -void d_plus_clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { +void d_plus_clover_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { int n = l->num_inner_lattice_sites, *neighbor = op->neighbor_table, start, end, nv = l->num_lattice_site_var; -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - complex_PRECISION *prn[4] = { op->prnT, op->prnZ, op->prnY, op->prnX }; - complex_PRECISION *prp[4] = { op->prpT, op->prpZ, op->prpY, op->prpX }; -#else int i, j, *nb_pt; - vector_PRECISION phi_pt, eta_pt, end_pt; + buffer_PRECISION phi_pt, eta_pt, end_pt; config_PRECISION D_pt; -#endif compute_core_start_end(0, nv*n, &start, &end, l, threading ); SYNC_MASTER_TO_ALL(threading) - clover_PRECISION(eta, phi, op, start, end, l, threading ); + clover_PRECISION( eta, phi, op, start, end, l, threading ); START_MASTER(threading) PROF_PRECISION_START( _NC ); @@ -373,17 +482,13 @@ void d_plus_clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operat #ifdef HAVE_TM1p1 if( g.n_flavours == 2 ) { -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - dprp_PRECISION( prn, phi, start, end ); -#else complex_PRECISION pbuf[12]; - for ( i=start/2, phi_pt=phi+start; ivector_buffer+start; iprnT+i, phi_pt ); dprp_Z_PRECISION( op->prnZ+i, phi_pt ); dprp_Y_PRECISION( op->prnY+i, phi_pt ); dprp_X_PRECISION( op->prnX+i, phi_pt ); } -#endif // start communication in negative direction START_LOCKED_MASTER(threading) ghost_sendrecv_PRECISION( op->prnT, T, -1, &(op->c), _FULL_SYSTEM, l ); @@ -392,11 +497,8 @@ void d_plus_clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operat ghost_sendrecv_PRECISION( op->prnX, X, -1, &(op->c), _FULL_SYSTEM, l ); END_LOCKED_MASTER(threading) -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - dprn_su3_PRECISION( prp, phi, op, neighbor, start, end ); -#else // project plus dir and multiply with U dagger - for ( phi_pt=phi+start, end_pt=phi+end, D_pt = op->D+((start/nv)*36), nb_pt=neighbor+((start/nv)*4); phi_ptvector_buffer+start, end_pt=phi->vector_buffer+end, D_pt = 
op->D+((start/nv)*36), nb_pt=neighbor+((start/nv)*4); phi_ptprpX+j+6, D_pt, pbuf+6 ); mvmh_PRECISION( op->prpX+j+9, D_pt, pbuf+9 ); D_pt += 9; } -#endif // start communication in positive direction START_LOCKED_MASTER(threading) @@ -441,11 +542,8 @@ void d_plus_clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operat ghost_wait_PRECISION( op->prnX, X, -1, &(op->c), _FULL_SYSTEM, l ); END_LOCKED_MASTER(threading) -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - su3_dpbp_PRECISION( eta, prn, op, neighbor, start, end ); -#else // multiply with U and lift up minus dir - for ( eta_pt=eta+start, end_pt=eta+end, D_pt = op->D+(start/nv)*36, nb_pt=neighbor+(start/nv)*4; eta_ptvector_buffer+start, end_pt=eta->vector_buffer+end, D_pt = op->D+(start/nv)*36, nb_pt=neighbor+(start/nv)*4; eta_ptprnT+j ); @@ -475,7 +573,6 @@ void d_plus_clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operat mvm_PRECISION( pbuf+9, D_pt, op->prnX+j+9 ); dpbp_su3_X_PRECISION( pbuf, eta_pt ); D_pt += 9; } -#endif // wait for communication in positive direction START_LOCKED_MASTER(threading) @@ -486,30 +583,22 @@ void d_plus_clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operat END_LOCKED_MASTER(threading) // lift up plus dir -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - dpbn_PRECISION( eta, prp, start, end ); -#else - for ( i=start/2, eta_pt=eta+start; ivector_buffer+start; iprpT+i, eta_pt ); dpbn_su3_Z_PRECISION( op->prpZ+i, eta_pt ); dpbn_su3_Y_PRECISION( op->prpY+i, eta_pt ); dpbn_su3_X_PRECISION( op->prpX+i, eta_pt ); } -#endif } else { #endif -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - prp_PRECISION( prn, phi, start, end ); -#else complex_PRECISION pbuf[6]; - for ( i=start/2, phi_pt=phi+start; ivector_buffer+start; iprnT+i, phi_pt ); prp_Z_PRECISION( op->prnZ+i, phi_pt ); prp_Y_PRECISION( op->prnY+i, phi_pt ); prp_X_PRECISION( op->prnX+i, phi_pt ); } -#endif // start communication in negative direction START_LOCKED_MASTER(threading) ghost_sendrecv_PRECISION( op->prnT, T, -1, &(op->c), _FULL_SYSTEM, l ); @@ -519,10 +608,7 @@ void d_plus_clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operat END_LOCKED_MASTER(threading) // project plus dir and multiply with U dagger -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - prn_su3_PRECISION( prp, phi, op, neighbor, start, end ); -#else - for ( phi_pt=phi+start, end_pt=phi+end, D_pt = op->D+(start*3), nb_pt=neighbor+((start/12)*4); phi_ptvector_buffer+start, end_pt=phi->vector_buffer+end, D_pt = op->D+(start*3), nb_pt=neighbor+((start/12)*4); phi_ptprpX+j, D_pt, pbuf ); mvmh_PRECISION( op->prpX+j+3, D_pt, pbuf+3 ); D_pt += 9; } -#endif // start communication in positive direction START_LOCKED_MASTER(threading) @@ -560,10 +645,7 @@ void d_plus_clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operat END_LOCKED_MASTER(threading) // multiply with U and lift up minus dir -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - su3_pbp_PRECISION( eta, prn, op, neighbor, start, end ); -#else - for ( eta_pt=eta+start, end_pt=eta+end, D_pt = op->D+start*3, nb_pt=neighbor+(start/12)*4; eta_ptvector_buffer+start, end_pt=eta->vector_buffer+end, D_pt = op->D+start*3, nb_pt=neighbor+(start/12)*4; eta_ptprnT+j ); @@ -585,7 +667,6 @@ void d_plus_clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operat mvm_PRECISION( pbuf+3, D_pt, op->prnX+j+3 ); pbp_su3_X_PRECISION( pbuf, eta_pt ); D_pt += 9; } -#endif // wait for communication in positive direction START_LOCKED_MASTER(threading) @@ -596,16 +677,12 @@ void 
d_plus_clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operat END_LOCKED_MASTER(threading) // lift up plus dir -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - pbn_PRECISION( eta, prp, start, end ); -#else - for ( i=start/2, eta_pt=eta+start; ivector_buffer+start; iprpT+i, eta_pt ); pbn_su3_Z_PRECISION( op->prpZ+i, eta_pt ); pbn_su3_Y_PRECISION( op->prpY+i, eta_pt ); pbn_su3_X_PRECISION( op->prpX+i, eta_pt ); } -#endif #ifdef HAVE_TM1p1 } #endif @@ -618,48 +695,282 @@ void d_plus_clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operat } -void gamma5_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ) { +void d_plus_clover_PRECISION_new( vector_PRECISION *eta, vector_PRECISION *phi, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { + + int n = l->num_inner_lattice_sites, *neighbor = op->neighbor_table, start, end, nv = l->num_lattice_site_var, n_vect = g.num_rhs_vect; + int i, j, *nb_pt; + buffer_PRECISION phi_pt, eta_pt, end_pt; + config_PRECISION D_pt; + //int phi_shift = (phi->num_vect == 1)?0:phi->size*n_vec, eta_shift = (eta->num_vect == 1)?0:eta->size*n_vec; + + compute_core_start_end(0, nv*n, &start, &end, l, threading ); + + //vector_PRECISION_change_layout( phi, phi, _LV_SV_NV, no_threading ); + //vector_PRECISION_change_layout( eta, eta, _LV_SV_NV, no_threading ); + + SYNC_MASTER_TO_ALL(threading) + clover_PRECISION_new( eta, phi, op, start, end, l, threading ); + START_MASTER(threading) + PROF_PRECISION_START( _NC ); + END_MASTER(threading) +/* +#ifdef HAVE_TM1p1 + if( g.n_flavours == 2 ) { + complex_PRECISION pbuf[12]; + for ( i=start/2, phi_pt=phi->vector_buffer+start+phi_shift; iprnT+i, phi_pt ); + dprp_Z_PRECISION( op->prnZ+i, phi_pt ); + dprp_Y_PRECISION( op->prnY+i, phi_pt ); + dprp_X_PRECISION( op->prnX+i, phi_pt ); + } + // start communication in negative direction + START_LOCKED_MASTER(threading) + ghost_sendrecv_PRECISION( op->prnT, T, -1, &(op->c), _FULL_SYSTEM, l ); + ghost_sendrecv_PRECISION( op->prnZ, Z, -1, &(op->c), _FULL_SYSTEM, l ); + ghost_sendrecv_PRECISION( op->prnY, Y, -1, &(op->c), _FULL_SYSTEM, l ); + ghost_sendrecv_PRECISION( op->prnX, X, -1, &(op->c), _FULL_SYSTEM, l ); + END_LOCKED_MASTER(threading) + // project plus dir and multiply with U dagger + for ( phi_pt=phi->vector_buffer+start+phi_shift,c end_pt=phi->vector_buffer+end+phi_shift, D_pt = op->D+((start/nv)*36), nb_pt=neighbor+((start/nv)*4); phi_ptprpT+j, D_pt, pbuf ); + mvmh_PRECISION( op->prpT+j+3, D_pt, pbuf+3 ); + mvmh_PRECISION( op->prpT+j+6, D_pt, pbuf+6 ); + mvmh_PRECISION( op->prpT+j+9, D_pt, pbuf+9 ); D_pt += 9; + // Z dir + j = nv/2*(*nb_pt); nb_pt++; + dprn_Z_PRECISION( pbuf, phi_pt ); + mvmh_PRECISION( op->prpZ+j, D_pt, pbuf ); + mvmh_PRECISION( op->prpZ+j+3, D_pt, pbuf+3 ); + mvmh_PRECISION( op->prpZ+j+6, D_pt, pbuf+6 ); + mvmh_PRECISION( op->prpZ+j+9, D_pt, pbuf+9 ); D_pt += 9; + // Y dir + j = nv/2*(*nb_pt); nb_pt++; + dprn_Y_PRECISION( pbuf, phi_pt ); + mvmh_PRECISION( op->prpY+j, D_pt, pbuf ); + mvmh_PRECISION( op->prpY+j+3, D_pt, pbuf+3 ); + mvmh_PRECISION( op->prpY+j+6, D_pt, pbuf+6 ); + mvmh_PRECISION( op->prpY+j+9, D_pt, pbuf+9 ); D_pt += 9; + // X dir + j = nv/2*(*nb_pt); nb_pt++; + dprn_X_PRECISION( pbuf, phi_pt ); + mvmh_PRECISION( op->prpX+j, D_pt, pbuf ); + mvmh_PRECISION( op->prpX+j+3, D_pt, pbuf+3 ); + mvmh_PRECISION( op->prpX+j+6, D_pt, pbuf+6 ); + mvmh_PRECISION( op->prpX+j+9, D_pt, pbuf+9 ); D_pt += 9; + } + // start communication in positive direction + 
START_LOCKED_MASTER(threading) + ghost_sendrecv_PRECISION( op->prpT, T, +1, &(op->c), _FULL_SYSTEM, l ); + ghost_sendrecv_PRECISION( op->prpZ, Z, +1, &(op->c), _FULL_SYSTEM, l ); + ghost_sendrecv_PRECISION( op->prpY, Y, +1, &(op->c), _FULL_SYSTEM, l ); + ghost_sendrecv_PRECISION( op->prpX, X, +1, &(op->c), _FULL_SYSTEM, l ); + // wait for communication in negative direction + ghost_wait_PRECISION( op->prnT, T, -1, &(op->c), _FULL_SYSTEM, l ); + ghost_wait_PRECISION( op->prnZ, Z, -1, &(op->c), _FULL_SYSTEM, l ); + ghost_wait_PRECISION( op->prnY, Y, -1, &(op->c), _FULL_SYSTEM, l ); + ghost_wait_PRECISION( op->prnX, X, -1, &(op->c), _FULL_SYSTEM, l ); + END_LOCKED_MASTER(threading) + + // multiply with U and lift up minus dir + for ( eta_pt=eta->vector_buffer+start+eta_shift, end_pt=eta->vector_buffer+end+eta_shift, D_pt = op->D+(start/nv)*36, nb_pt=neighbor+(start/nv)*4; eta_ptprnT+j ); + mvm_PRECISION( pbuf+3, D_pt, op->prnT+j+3 ); + mvm_PRECISION( pbuf+6, D_pt, op->prnT+j+6 ); + mvm_PRECISION( pbuf+9, D_pt, op->prnT+j+9 ); + dpbp_su3_T_PRECISION( pbuf, eta_pt ); D_pt += 9; + // Z dir + j = nv/2*(*nb_pt); nb_pt++; + mvm_PRECISION( pbuf, D_pt, op->prnZ+j ); + mvm_PRECISION( pbuf+3, D_pt, op->prnZ+j+3 ); + mvm_PRECISION( pbuf+6, D_pt, op->prnZ+j+6 ); + mvm_PRECISION( pbuf+9, D_pt, op->prnZ+j+9 ); + dpbp_su3_Z_PRECISION( pbuf, eta_pt ); D_pt += 9; + // Y dir + j = nv/2*(*nb_pt); nb_pt++; + mvm_PRECISION( pbuf, D_pt, op->prnY+j ); + mvm_PRECISION( pbuf+3, D_pt, op->prnY+j+3 ); + mvm_PRECISION( pbuf+6, D_pt, op->prnY+j+6 ); + mvm_PRECISION( pbuf+9, D_pt, op->prnY+j+9 ); + dpbp_su3_Y_PRECISION( pbuf, eta_pt ); D_pt += 9; + // X dir + j = nv/2*(*nb_pt); nb_pt++; + mvm_PRECISION( pbuf, D_pt, op->prnX+j ); + mvm_PRECISION( pbuf+3, D_pt, op->prnX+j+3 ); + mvm_PRECISION( pbuf+6, D_pt, op->prnX+j+6 ); + mvm_PRECISION( pbuf+9, D_pt, op->prnX+j+9 ); + dpbp_su3_X_PRECISION( pbuf, eta_pt ); D_pt += 9; + } + + // wait for communication in positive direction + START_LOCKED_MASTER(threading) + ghost_wait_PRECISION( op->prpT, T, +1, &(op->c), _FULL_SYSTEM, l ); + ghost_wait_PRECISION( op->prpZ, Z, +1, &(op->c), _FULL_SYSTEM, l ); + ghost_wait_PRECISION( op->prpY, Y, +1, &(op->c), _FULL_SYSTEM, l ); + ghost_wait_PRECISION( op->prpX, X, +1, &(op->c), _FULL_SYSTEM, l ); + END_LOCKED_MASTER(threading) + + // lift up plus dir + for ( i=start/2, eta_pt=eta->vector_buffer+start+eta_shift; iprpT+i, eta_pt ); + dpbn_su3_Z_PRECISION( op->prpZ+i, eta_pt ); + dpbn_su3_Y_PRECISION( op->prpY+i, eta_pt ); + dpbn_su3_X_PRECISION( op->prpX+i, eta_pt ); + } + } else { +#endif +*/ + complex_PRECISION pbuf[6*n_vect]; + for ( i=start*n_vect/2, phi_pt=phi->vector_buffer+start*n_vect; iprnT+i, phi_pt ); + prp_Z_PRECISION_new( op->prnZ+i, phi_pt ); + prp_Y_PRECISION_new( op->prnY+i, phi_pt ); + prp_X_PRECISION_new( op->prnX+i, phi_pt ); + } + // start communication in negative direction + START_LOCKED_MASTER(threading) + ghost_sendrecv_PRECISION( op->prnT, T, -1, &(op->c), _FULL_SYSTEM, l ); + ghost_sendrecv_PRECISION( op->prnZ, Z, -1, &(op->c), _FULL_SYSTEM, l ); + ghost_sendrecv_PRECISION( op->prnY, Y, -1, &(op->c), _FULL_SYSTEM, l ); + ghost_sendrecv_PRECISION( op->prnX, X, -1, &(op->c), _FULL_SYSTEM, l ); + END_LOCKED_MASTER(threading) + + // project plus dir and multiply with U dagger + for ( phi_pt=phi->vector_buffer+start*n_vect, end_pt=phi->vector_buffer+end*n_vect, D_pt = op->D+(start*3), nb_pt=neighbor+((start/12)*4); phi_ptprpT+j, D_pt, pbuf ); + mvmh_PRECISION_new( op->prpT+j+3*n_vect, D_pt, pbuf+3*n_vect ); D_pt += 9; + 
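/* Hedged aside (not part of the patch): the *_new code path above assumes an
 * interleaved multi-right-hand-side layout: the n_vect = g.num_rhs_vect values
 * of one spin-colour component are stored contiguously, so component c of
 * right-hand side r lives at index c*n_vect + r.  The helper below only
 * illustrates that layout; pack_interleaved_rhs is a hypothetical name, not a
 * function of this code base. */
static void pack_interleaved_rhs( complex_PRECISION *out, complex_PRECISION **in,
                                  int n_components, int n_vect ) {
  for ( int c=0; c<n_components; c++ )   /* spin-colour components of the block */
    for ( int r=0; r<n_vect; r++ )       /* right-hand sides: fastest index */
      out[c*n_vect + r] = in[r][c];      /* in[r] is the r-th single-RHS buffer */
}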
// Z dir + j = 6*(*nb_pt)*n_vect; nb_pt++; + prn_Z_PRECISION_new( pbuf, phi_pt ); + mvmh_PRECISION_new( op->prpZ+j, D_pt, pbuf ); + mvmh_PRECISION_new( op->prpZ+j+3*n_vect, D_pt, pbuf+3*n_vect ); D_pt += 9; + // Y dir + j = 6*(*nb_pt)*n_vect; nb_pt++; + prn_Y_PRECISION_new( pbuf, phi_pt ); + mvmh_PRECISION_new( op->prpY+j, D_pt, pbuf ); + mvmh_PRECISION_new( op->prpY+j+3*n_vect, D_pt, pbuf+3*n_vect ); D_pt += 9; + // X dir + j = 6*(*nb_pt)*n_vect; nb_pt++; + prn_X_PRECISION_new( pbuf, phi_pt ); + mvmh_PRECISION_new( op->prpX+j, D_pt, pbuf ); + mvmh_PRECISION_new( op->prpX+j+3*n_vect, D_pt, pbuf+3*n_vect ); D_pt += 9; + } + + // start communication in positive direction + START_LOCKED_MASTER(threading) + ghost_sendrecv_PRECISION( op->prpT, T, +1, &(op->c), _FULL_SYSTEM, l ); + ghost_sendrecv_PRECISION( op->prpZ, Z, +1, &(op->c), _FULL_SYSTEM, l ); + ghost_sendrecv_PRECISION( op->prpY, Y, +1, &(op->c), _FULL_SYSTEM, l ); + ghost_sendrecv_PRECISION( op->prpX, X, +1, &(op->c), _FULL_SYSTEM, l ); + // wait for communication in negative direction + ghost_wait_PRECISION( op->prnT, T, -1, &(op->c), _FULL_SYSTEM, l ); + ghost_wait_PRECISION( op->prnZ, Z, -1, &(op->c), _FULL_SYSTEM, l ); + ghost_wait_PRECISION( op->prnY, Y, -1, &(op->c), _FULL_SYSTEM, l ); + ghost_wait_PRECISION( op->prnX, X, -1, &(op->c), _FULL_SYSTEM, l ); + END_LOCKED_MASTER(threading) + + // multiply with U and lift up minus dir + for ( eta_pt=eta->vector_buffer+start*n_vect, end_pt=eta->vector_buffer+end*n_vect, D_pt = op->D+start*3, nb_pt=neighbor+(start/12)*4; eta_ptprnT+j ); + mvm_PRECISION_new( pbuf+3*n_vect, D_pt, op->prnT+j+3*n_vect ); + pbp_su3_T_PRECISION_new( pbuf, eta_pt ); D_pt += 9; + // Z dir + j = 6*(*nb_pt)*n_vect; nb_pt++; + mvm_PRECISION_new( pbuf, D_pt, op->prnZ+j ); + mvm_PRECISION_new( pbuf+3*n_vect, D_pt, op->prnZ+j+3*n_vect ); + pbp_su3_Z_PRECISION_new( pbuf, eta_pt ); D_pt += 9; + // Y dir + j = 6*(*nb_pt)*n_vect; nb_pt++; + mvm_PRECISION_new( pbuf, D_pt, op->prnY+j ); + mvm_PRECISION_new( pbuf+3*n_vect, D_pt, op->prnY+j+3*n_vect ); + pbp_su3_Y_PRECISION_new( pbuf, eta_pt ); D_pt += 9; + // X dir + j = 6*(*nb_pt)*n_vect; nb_pt++; + mvm_PRECISION_new( pbuf, D_pt, op->prnX+j ); + mvm_PRECISION_new( pbuf+3*n_vect, D_pt, op->prnX+j+3*n_vect ); + pbp_su3_X_PRECISION_new( pbuf, eta_pt ); D_pt += 9; + } + + // wait for communication in positive direction + START_LOCKED_MASTER(threading) + ghost_wait_PRECISION( op->prpT, T, +1, &(op->c), _FULL_SYSTEM, l ); + ghost_wait_PRECISION( op->prpZ, Z, +1, &(op->c), _FULL_SYSTEM, l ); + ghost_wait_PRECISION( op->prpY, Y, +1, &(op->c), _FULL_SYSTEM, l ); + ghost_wait_PRECISION( op->prpX, X, +1, &(op->c), _FULL_SYSTEM, l ); + END_LOCKED_MASTER(threading) + + // lift up plus dir + for ( i=start*n_vect/2, eta_pt=eta->vector_buffer+start*n_vect; iprpT+i, eta_pt ); + pbn_su3_Z_PRECISION_new( op->prpZ+i, eta_pt ); + pbn_su3_Y_PRECISION_new( op->prpY+i, eta_pt ); + pbn_su3_X_PRECISION_new( op->prpX+i, eta_pt ); + } +/*#ifdef HAVE_TM1p1 + } +#endif*/ + + //vector_PRECISION_change_layout( phi, phi, _NV_LV_SV, no_threading ); + //vector_PRECISION_change_layout( eta, eta, _NV_LV_SV, no_threading ); + + START_MASTER(threading) + PROF_PRECISION_STOP( _NC, 1 ); + END_MASTER(threading) + + SYNC_MASTER_TO_ALL(threading) +} + + + +void gamma5_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, level_struct *l, struct Thread *threading ) { ASSERT(l->depth == 0); - vector_PRECISION eta_end = eta + threading->end_index[l->depth]; - eta += threading->start_index[l->depth]; - phi += 
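/* Hedged aside: in the spin basis used here gamma_5 acts on the 12 spin-colour
 * components of a site as diag(-1,...,-1,+1,...,+1) (six minus, six plus),
 * which is exactly what the FOR6 pairs of gamma5_PRECISION below implement
 * (FOR12 pairs for the two-flavour doublet).  Hypothetical per-site sketch: */
static void gamma5_site( complex_PRECISION *eta, const complex_PRECISION *phi ) {
  for ( int c=0;  c<6;  c++ ) eta[c] = -phi[c];  /* spins 0,1: sign flip */
  for ( int c=6;  c<12; c++ ) eta[c] =  phi[c];  /* spins 2,3: copy */
}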
threading->start_index[l->depth]; + buffer_PRECISION eta_end = eta->vector_buffer + threading->end_index[l->depth]; + buffer_PRECISION leta = eta->vector_buffer + threading->start_index[l->depth], lphi = phi->vector_buffer + threading->start_index[l->depth]; #ifdef HAVE_TM1p1 if ( g.n_flavours == 2 ) { - while ( eta < eta_end ) { - FOR12( *eta = -(*phi); phi++; eta++; ) - FOR12( *eta = (*phi); phi++; eta++; ) + while ( leta < eta_end ) { + FOR12( *leta = -(*lphi); lphi++; leta++; ) + FOR12( *leta = (*lphi); lphi++; leta++; ) } } else #endif - while ( eta < eta_end ) { - FOR6( *eta = -(*phi); phi++; eta++; ) - FOR6( *eta = (*phi); phi++; eta++; ) + while ( leta < eta_end ) { + FOR6( *leta = -(*lphi); lphi++; leta++; ) + FOR6( *leta = (*lphi); lphi++; leta++; ) } } -void tau1_gamma5_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ) { +void tau1_gamma5_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, level_struct *l, struct Thread *threading ) { ASSERT(l->depth == 0); #ifdef HAVE_TM1p1 if ( g.n_flavours == 2 ) { - vector_PRECISION eta_end = eta + threading->end_index[l->depth]; + buffer_PRECISION eta_end = eta->vector_buffer + threading->end_index[l->depth]; + buffer_PRECISION leta = eta->vector_buffer + threading->start_index[l->depth], lphi = phi->vector_buffer + threading->start_index[l->depth]; complex_PRECISION b[6]; - eta += threading->start_index[l->depth]; - phi += threading->start_index[l->depth]; - while ( eta < eta_end ) { + while ( leta < eta_end ) { int i = 0; - FOR6( b[i] = (*phi); phi++; i++; ); - FOR6( *eta = -(*phi); phi++; eta++; ); + FOR6( b[i] = (*lphi); lphi++; i++; ); + FOR6( *leta = -(*lphi); lphi++; leta++; ); i = 0; - FOR6( *eta = - b[i] ; eta++; i++; ); + FOR6( *leta = - b[i] ; leta++; i++; ); i = 0; - FOR6( b[i] = (*phi); phi++; i++; ); - FOR6( *eta = (*phi); phi++; eta++; ); + FOR6( b[i] = (*lphi); lphi++; i++; ); + FOR6( *leta = (*lphi); lphi++; leta++; ); i = 0; - FOR6( *eta = b[i] ; eta++; i++; ); + FOR6( *leta = b[i] ; leta++; i++; ); } } else #endif @@ -671,100 +982,97 @@ void tau1_gamma5_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_st } } -void set_even_to_zero_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ) { +void set_even_to_zero_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, level_struct *l, struct Thread *threading ) { ASSERT(l->depth == 0); int i = threading->start_site[l->depth]; - vector_PRECISION eta_end = eta + threading->end_index[l->depth]; - eta += threading->start_index[l->depth]; - phi += threading->start_index[l->depth]; + buffer_PRECISION eta_end = eta->vector_buffer + threading->end_index[l->depth]; + buffer_PRECISION leta = eta->vector_buffer + threading->start_index[l->depth], lphi = phi->vector_buffer + threading->start_index[l->depth]; #ifdef HAVE_TM1p1 if ( g.n_flavours == 2 ) - while ( eta < eta_end ) { + while ( leta < eta_end ) { if(g.odd_even_table[i]==_ODD) { - FOR24( *eta = (*phi); phi++; eta++; ); + FOR24( *leta = (*lphi); lphi++; leta++; ); } else if(g.odd_even_table[i]==_EVEN) { - FOR24( *eta = _COMPLEX_PRECISION_ZERO; phi++; eta++; ); + FOR24( *leta = _COMPLEX_PRECISION_ZERO; lphi++; leta++; ); } i++; } else #endif - while ( eta < eta_end ) { + while ( leta < eta_end ) { if(g.odd_even_table[i]==_ODD) { - FOR12( *eta = (*phi); phi++; eta++; ); + FOR12( *leta = (*lphi); lphi++; leta++; ); } else if(g.odd_even_table[i]==_EVEN) { - FOR12( *eta = _COMPLEX_PRECISION_ZERO; phi++; eta++; ); + FOR12( 
*leta = _COMPLEX_PRECISION_ZERO; lphi++; leta++; ); } i++; } } -void gamma5_set_even_to_zero_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ) { +void gamma5_set_even_to_zero_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, level_struct *l, struct Thread *threading ) { ASSERT(l->depth == 0); int i = threading->start_site[l->depth]; - vector_PRECISION eta_end = eta + threading->end_index[l->depth]; - eta += threading->start_index[l->depth]; - phi += threading->start_index[l->depth]; + buffer_PRECISION eta_end = eta->vector_buffer + threading->end_index[l->depth]; + buffer_PRECISION leta = eta->vector_buffer + threading->start_index[l->depth], lphi = phi->vector_buffer + threading->start_index[l->depth]; #ifdef HAVE_TM1p1 if ( g.n_flavours == 2 ) - while ( eta < eta_end ) { + while ( leta < eta_end ) { if(g.odd_even_table[i]==_ODD){ - FOR12( *eta = -(*phi); phi++; eta++; ); - FOR12( *eta = (*phi); phi++; eta++; ); + FOR12( *leta = -(*lphi); lphi++; leta++; ); + FOR12( *leta = (*lphi); lphi++; leta++; ); } else if(g.odd_even_table[i]==_EVEN){ - FOR24( *eta = _COMPLEX_PRECISION_ZERO; phi++; eta++; ); + FOR24( *leta = _COMPLEX_PRECISION_ZERO; lphi++; leta++; ); } i++; } else #endif - while ( eta < eta_end ) { + while ( leta < eta_end ) { if(g.odd_even_table[i]==_ODD){ - FOR6( *eta = -(*phi); phi++; eta++; ); - FOR6( *eta = (*phi); phi++; eta++; ); + FOR6( *leta = -(*lphi); lphi++; leta++; ); + FOR6( *leta = (*lphi); lphi++; leta++; ); } else if(g.odd_even_table[i]==_EVEN){ - FOR12( *eta = _COMPLEX_PRECISION_ZERO; phi++; eta++; ); + FOR12( *leta = _COMPLEX_PRECISION_ZERO; lphi++; leta++; ); } i++; } } -void tau1_gamma5_set_even_to_zero_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ) { +void tau1_gamma5_set_even_to_zero_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, level_struct *l, struct Thread *threading ) { ASSERT(l->depth == 0); #ifdef HAVE_TM1p1 if ( g.n_flavours == 2 ) { int i = threading->start_site[l->depth]; - vector_PRECISION eta_end = eta + threading->end_index[l->depth]; - eta += threading->start_index[l->depth]; - phi += threading->start_index[l->depth]; + buffer_PRECISION eta_end = eta->vector_buffer + threading->end_index[l->depth]; + buffer_PRECISION leta = eta->vector_buffer + threading->start_index[l->depth], lphi = phi->vector_buffer + threading->start_index[l->depth]; complex_PRECISION b[6]; - while ( eta < eta_end ) { + while ( leta < eta_end ) { if(g.odd_even_table[i]==_ODD){ int i = 0; - FOR6( b[i] = (*phi); phi++; i++; ); - FOR6( *eta = -(*phi); phi++; eta++; ); + FOR6( b[i] = (*lphi); lphi++; i++; ); + FOR6( *leta = -(*lphi); lphi++; leta++; ); i = 0; - FOR6( *eta = - b[i] ; eta++; i++; ); + FOR6( *leta = - b[i] ; leta++; i++; ); i = 0; - FOR6( b[i] = (*phi); phi++; i++; ); - FOR6( *eta = (*phi); phi++; eta++; ); + FOR6( b[i] = (*lphi); lphi++; i++; ); + FOR6( *leta = (*lphi); lphi++; leta++; ); i = 0; - FOR6( *eta = b[i] ; eta++; i++; ); + FOR6( *leta = b[i] ; leta++; i++; ); } else if(g.odd_even_table[i]==_EVEN){ - FOR24( *eta = _COMPLEX_PRECISION_ZERO; phi++; eta++; ); + FOR24( *leta = _COMPLEX_PRECISION_ZERO; lphi++; leta++; ); } i++; } @@ -778,96 +1086,93 @@ void tau1_gamma5_set_even_to_zero_PRECISION( vector_PRECISION eta, vector_PRECIS } } -void set_odd_to_zero_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ) { +void set_odd_to_zero_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, 
level_struct *l, struct Thread *threading ) { int i = threading->start_site[l->depth]; - vector_PRECISION eta_end = eta + threading->end_index[l->depth]; - eta += threading->start_index[l->depth]; - phi += threading->start_index[l->depth]; + buffer_PRECISION eta_end = eta->vector_buffer + threading->end_index[l->depth]; + buffer_PRECISION leta = eta->vector_buffer + threading->start_index[l->depth], lphi = phi->vector_buffer + threading->start_index[l->depth]; #ifdef HAVE_TM1p1 if ( g.n_flavours == 2 ) - while ( eta < eta_end ) { + while ( leta < eta_end ) { if(g.odd_even_table[i]==_EVEN){ - FOR24( *eta = (*phi); phi++; eta++; ); + FOR24( *leta = (*lphi); lphi++; leta++; ); } else if(g.odd_even_table[i]==_ODD){ - FOR24( *eta = 0; phi++; eta++; ); + FOR24( *leta = 0; lphi++; leta++; ); } i++; } else #endif - while ( eta < eta_end ) { + while ( leta < eta_end ) { if(g.odd_even_table[i]==_EVEN) { - FOR12( *eta = (*phi); phi++; eta++; ); + FOR12( *leta = (*lphi); lphi++; leta++; ); } else if(g.odd_even_table[i]==_ODD) { - FOR12( *eta = 0; phi++; eta++; ); + FOR12( *leta = 0; lphi++; leta++; ); } i++; } } -void gamma5_set_odd_to_zero_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ) { +void gamma5_set_odd_to_zero_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, level_struct *l, struct Thread *threading ) { int i = threading->start_site[l->depth]; - vector_PRECISION eta_end = eta + threading->end_index[l->depth]; - eta += threading->start_index[l->depth]; - phi += threading->start_index[l->depth]; + buffer_PRECISION eta_end = eta->vector_buffer + threading->end_index[l->depth]; + buffer_PRECISION leta = eta->vector_buffer + threading->start_index[l->depth], lphi = phi->vector_buffer + threading->start_index[l->depth]; #ifdef HAVE_TM1p1 if ( g.n_flavours == 2 ) - while ( eta < eta_end ) { + while ( leta < eta_end ) { if(g.odd_even_table[i]==_EVEN){ - FOR12( *eta = -(*phi); phi++; eta++; ); - FOR12( *eta = (*phi); phi++; eta++; ); + FOR12( *leta = -(*lphi); lphi++; leta++; ); + FOR12( *leta = (*lphi); lphi++; leta++; ); } else if(g.odd_even_table[i]==_ODD){ - FOR24( *eta = 0; phi++; eta++; ); + FOR24( *leta = 0; lphi++; leta++; ); } i++; } else #endif - while ( eta < eta_end ) { + while ( leta < eta_end ) { if(g.odd_even_table[i]==_EVEN){ - FOR6( *eta = -(*phi); phi++; eta++; ); - FOR6( *eta = (*phi); phi++; eta++; ); + FOR6( *leta = -(*lphi); lphi++; leta++; ); + FOR6( *leta = (*lphi); lphi++; leta++; ); } else if(g.odd_even_table[i]==_ODD){ - FOR12( *eta = 0; phi++; eta++; ); + FOR12( *leta = 0; lphi++; leta++; ); } i++; } } -void tau1_gamma5_set_odd_to_zero_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ) { +void tau1_gamma5_set_odd_to_zero_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, level_struct *l, struct Thread *threading ) { ASSERT(l->depth == 0); #ifdef HAVE_TM1p1 if ( g.n_flavours == 2 ) { int i = threading->start_site[l->depth]; - vector_PRECISION eta_end = eta + threading->end_index[l->depth]; - eta += threading->start_index[l->depth]; - phi += threading->start_index[l->depth]; + buffer_PRECISION eta_end = eta->vector_buffer + threading->end_index[l->depth]; + buffer_PRECISION leta = eta->vector_buffer + threading->start_index[l->depth], lphi = phi->vector_buffer + threading->start_index[l->depth]; complex_PRECISION b[6]; - while ( eta < eta_end ) { + while ( leta < eta_end ) { if(g.odd_even_table[i]==_EVEN){ int i = 0; - FOR6( b[i] = (*phi); phi++; i++; ); - FOR6( 
*eta = -(*phi); phi++; eta++; ); + FOR6( b[i] = (*lphi); lphi++; i++; ); + FOR6( *leta = -(*lphi); lphi++; leta++; ); i = 0; - FOR6( *eta = - b[i] ; eta++; i++; ); + FOR6( *leta = - b[i] ; leta++; i++; ); i = 0; - FOR6( b[i] = (*phi); phi++; i++; ); - FOR6( *eta = (*phi); phi++; eta++; ); + FOR6( b[i] = (*lphi); lphi++; i++; ); + FOR6( *leta = (*lphi); lphi++; leta++; ); i = 0; - FOR6( *eta = b[i] ; eta++; i++; ); + FOR6( *leta = b[i] ; leta++; i++; ); } else if(g.odd_even_table[i]==_ODD){ - FOR24( *eta = _COMPLEX_PRECISION_ZERO; phi++; eta++; ); + FOR24( *leta = _COMPLEX_PRECISION_ZERO; lphi++; leta++; ); } i++; } @@ -881,40 +1186,39 @@ void tau1_gamma5_set_odd_to_zero_PRECISION( vector_PRECISION eta, vector_PRECISI } } -void scale_even_odd_PRECISION( vector_PRECISION eta, vector_PRECISION phi, complex_double even, complex_double odd, +void scale_even_odd_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, complex_double even, complex_double odd, level_struct *l, struct Thread *threading ) { int i = threading->start_site[l->depth]; - vector_PRECISION eta_end = eta + threading->end_index[l->depth]; - eta += threading->start_index[l->depth]; - phi += threading->start_index[l->depth]; + buffer_PRECISION eta_end = eta->vector_buffer + threading->end_index[l->depth]; + buffer_PRECISION leta = eta->vector_buffer + threading->start_index[l->depth], lphi = phi->vector_buffer + threading->start_index[l->depth]; #ifdef HAVE_TM1p1 if ( g.n_flavours == 2 ) - while ( eta < eta_end ) { + while ( leta < eta_end ) { if(g.odd_even_table[i]==_EVEN){ - FOR24( *eta = even*(*phi); phi++; eta++; ); + FOR24( *leta = even*(*lphi); lphi++; leta++; ); } else if(g.odd_even_table[i]==_ODD){ - FOR24( *eta = odd*(*phi); phi++; eta++; ); + FOR24( *leta = odd*(*lphi); lphi++; leta++; ); } i++; } else #endif - while ( eta < eta_end ) { + while ( leta < eta_end ) { if(g.odd_even_table[i]==_EVEN) { - FOR12( *eta = even*(*phi); phi++; eta++; ); + FOR12( *leta = even*(*lphi); lphi++; leta++; ); } else if(g.odd_even_table[i]==_ODD) { - FOR12( *eta = odd*(*phi); phi++; eta++; ); + FOR12( *leta = odd*(*lphi); lphi++; leta++; ); } i++; } } -void two_flavours_to_serial_PRECISION( vector_PRECISION flav1, vector_PRECISION flav2, vector_PRECISION serial, level_struct *l, struct Thread *threading ) { +void two_flavours_to_serial_PRECISION( vector_PRECISION *flav1, vector_PRECISION *flav2, vector_PRECISION *serial, level_struct *l, struct Thread *threading ) { #ifdef HAVE_TM1p1 @@ -924,26 +1228,27 @@ void two_flavours_to_serial_PRECISION( vector_PRECISION flav1, vector_PRECISION * spin2and3 of flav1 * spin2and3 of flav2 */ - vector_PRECISION serial_end; + buffer_PRECISION serial_end; + buffer_PRECISION serial_pt = serial->vector_buffer, flav1_pt = flav1->vector_buffer, flav2_pt = flav2->vector_buffer; if( g.n_flavours == 2 ) { - serial_end = serial + threading->end_index[l->depth]; - serial += threading->start_index[l->depth]; - flav1 += threading->start_index[l->depth]/2; - flav2 += threading->start_index[l->depth]/2; + serial_end = serial->vector_buffer + threading->end_index[l->depth]; + serial_pt += threading->start_index[l->depth]; + flav1_pt += threading->start_index[l->depth]/2; + flav2_pt += threading->start_index[l->depth]/2; } else { - serial_end = serial + threading->end_index[l->depth]*2; - serial += threading->start_index[l->depth]*2; - flav1 += threading->start_index[l->depth]; - flav2 += threading->start_index[l->depth]; + serial_end = serial->vector_buffer + threading->end_index[l->depth]*2; + serial_pt += 
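/* Hedged aside: per site, the copy loop of two_flavours_to_serial_PRECISION
 * below builds the 24-component doublet layout  [ spins 0,1 of flavour 1 |
 * spins 0,1 of flavour 2 | spins 2,3 of flavour 1 | spins 2,3 of flavour 2 ].
 * Hypothetical per-site sketch (doublet_site_to_serial is not a function of
 * this code base): */
static void doublet_site_to_serial( complex_PRECISION *serial,
                                    const complex_PRECISION *f1,
                                    const complex_PRECISION *f2 ) {
  for ( int c=0; c<6; c++ ) serial[c]    = f1[c];    /* spins 0,1 of flavour 1 */
  for ( int c=0; c<6; c++ ) serial[6+c]  = f2[c];    /* spins 0,1 of flavour 2 */
  for ( int c=0; c<6; c++ ) serial[12+c] = f1[6+c];  /* spins 2,3 of flavour 1 */
  for ( int c=0; c<6; c++ ) serial[18+c] = f2[6+c];  /* spins 2,3 of flavour 2 */
}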
threading->start_index[l->depth]*2; + flav1_pt += threading->start_index[l->depth]; + flav2_pt += threading->start_index[l->depth]; } - while ( serial < serial_end ) { - FOR6( *serial = (*flav1); serial++; flav1++; ) - FOR6( *serial = (*flav2); serial++; flav2++; ) - FOR6( *serial = (*flav1); serial++; flav1++; ) - FOR6( *serial = (*flav2); serial++; flav2++; ) + while ( serial_pt < serial_end ) { + FOR6( *serial_pt = (*flav1_pt); serial_pt++; flav1_pt++; ) + FOR6( *serial_pt = (*flav2_pt); serial_pt++; flav2_pt++; ) + FOR6( *serial_pt = (*flav1_pt); serial_pt++; flav1_pt++; ) + FOR6( *serial_pt = (*flav2_pt); serial_pt++; flav2_pt++; ) } #else START_MASTER(threading) @@ -953,29 +1258,30 @@ void two_flavours_to_serial_PRECISION( vector_PRECISION flav1, vector_PRECISION } -void serial_to_two_flavours_PRECISION( vector_PRECISION flav1, vector_PRECISION flav2, vector_PRECISION serial, level_struct *l, struct Thread *threading ) { +void serial_to_two_flavours_PRECISION( vector_PRECISION *flav1, vector_PRECISION *flav2, vector_PRECISION *serial, level_struct *l, struct Thread *threading ) { #ifdef HAVE_TM1p1 - vector_PRECISION serial_end; - + buffer_PRECISION serial_end; + buffer_PRECISION serial_pt = serial->vector_buffer, flav1_pt = flav1->vector_buffer, flav2_pt = flav2->vector_buffer; + if( g.n_flavours == 2 ) { - serial_end = serial + threading->end_index[l->depth]; - serial += threading->start_index[l->depth]; - flav1 += threading->start_index[l->depth]/2; - flav2 += threading->start_index[l->depth]/2; + serial_end = serial->vector_buffer + threading->end_index[l->depth]; + serial_pt += threading->start_index[l->depth]; + flav1_pt += threading->start_index[l->depth]/2; + flav2_pt += threading->start_index[l->depth]/2; } else { - serial_end = serial + threading->end_index[l->depth]*2; - serial += threading->start_index[l->depth]*2; - flav1 += threading->start_index[l->depth]; - flav2 += threading->start_index[l->depth]; + serial_end = serial->vector_buffer + threading->end_index[l->depth]*2; + serial_pt += threading->start_index[l->depth]*2; + flav1_pt += threading->start_index[l->depth]; + flav2_pt += threading->start_index[l->depth]; } - while ( serial < serial_end ) { - FOR6( *flav1 = (*serial); serial++; flav1++; ) - FOR6( *flav2 = (*serial); serial++; flav2++; ) - FOR6( *flav1 = (*serial); serial++; flav1++; ) - FOR6( *flav2 = (*serial); serial++; flav2++; ) + while ( serial_pt < serial_end ) { + FOR6( *flav1_pt = (*serial_pt); serial_pt++; flav1_pt++; ) + FOR6( *flav2_pt = (*serial_pt); serial_pt++; flav2_pt++; ) + FOR6( *flav1_pt = (*serial_pt); serial_pt++; flav1_pt++; ) + FOR6( *flav2_pt = (*serial_pt); serial_pt++; flav2_pt++; ) } #else START_MASTER(threading) @@ -985,28 +1291,28 @@ void serial_to_two_flavours_PRECISION( vector_PRECISION flav1, vector_PRECISION } -void g5D_plus_clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { +void g5D_plus_clover_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { d_plus_clover_PRECISION( eta, phi, op, l, threading ); SYNC_CORES(threading) gamma5_PRECISION( eta, eta, l, threading ); SYNC_CORES(threading) } -void diagonal_aggregate_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, vector_PRECISION phi, config_PRECISION diag, level_struct *l ) { +void diagonal_aggregate_PRECISION( vector_PRECISION *eta1, vector_PRECISION *eta2, vector_PRECISION *phi, config_PRECISION 
diag, level_struct *l ) { - vector_PRECISION eta_end = eta1 + l->inner_vector_size; - - while ( eta1 < eta_end ) { - FOR6( *eta1 = (*phi)*(*diag); *eta2 = _COMPLEX_PRECISION_ZERO; eta1++; eta2++; phi++; diag++; ); - FOR6( *eta2 = (*phi)*(*diag); *eta1 = _COMPLEX_PRECISION_ZERO; eta1++; eta2++; phi++; diag++; ); + buffer_PRECISION eta_end = eta1->vector_buffer + l->inner_vector_size; + buffer_PRECISION eta1_pt = eta1->vector_buffer, eta2_pt = eta2->vector_buffer, phi_pt = phi->vector_buffer; + while ( eta1_pt < eta_end ) { + FOR6( *eta1_pt = (*phi_pt)*(*diag); *eta2_pt = _COMPLEX_PRECISION_ZERO; eta1_pt++; eta2_pt++; phi_pt++; diag++; ); + FOR6( *eta2_pt = (*phi_pt)*(*diag); *eta1_pt = _COMPLEX_PRECISION_ZERO; eta1_pt++; eta2_pt++; phi_pt++; diag++; ); } } -void d_plus_clover_aggregate_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, vector_PRECISION phi, schwarz_PRECISION_struct *s, level_struct *l ) { +void d_plus_clover_aggregate_PRECISION( vector_PRECISION *eta1, vector_PRECISION *eta2, vector_PRECISION *phi, schwarz_PRECISION_struct *s, level_struct *l ) { int i, length, index1, index2, *index_dir, *neighbor = s->op.neighbor_table; - vector_PRECISION eta1_pt, eta2_pt, phi_pt; + buffer_PRECISION eta1_pt, eta2_pt, phi_pt; complex_PRECISION buffer1[12], buffer2[12]; config_PRECISION D_pt, D = s->op.D; @@ -1018,84 +1324,84 @@ void d_plus_clover_aggregate_PRECISION( vector_PRECISION eta1, vector_PRECISION length = l->is_PRECISION.agg_length[T]; index_dir = l->is_PRECISION.agg_index[T]; for ( i=0; ivector_buffer + 12*index2; D_pt = D + 36*index1+9*T; mvm_PRECISION( buffer1, D_pt, phi_pt ); mvm_PRECISION( buffer1+3, D_pt, phi_pt+3 ); mvm_PRECISION( buffer1+6, D_pt, phi_pt+6 ); mvm_PRECISION( buffer1+9, D_pt, phi_pt+9 ); - phi_pt = phi + 12*index1; + phi_pt = phi->vector_buffer + 12*index1; mvmh_PRECISION( buffer2, D_pt, phi_pt ); mvmh_PRECISION( buffer2+3, D_pt, phi_pt+3 ); mvmh_PRECISION( buffer2+6, D_pt, phi_pt+6 ); mvmh_PRECISION( buffer2+9, D_pt, phi_pt+9 ); - eta1_pt = eta1 + 12*index1; eta2_pt = eta2 + 12*index1; + eta1_pt = eta1->vector_buffer + 12*index1; eta2_pt = eta2->vector_buffer + 12*index1; twospin_p_T_PRECISION( eta1_pt, eta2_pt, buffer1 ); - eta1_pt = eta1 + 12*index2; eta2_pt = eta2 + 12*index2; + eta1_pt = eta1->vector_buffer + 12*index2; eta2_pt = eta2->vector_buffer + 12*index2; twospin_n_T_PRECISION( eta1_pt, eta2_pt, buffer2 ); } // Z dir length = l->is_PRECISION.agg_length[Z]; index_dir = l->is_PRECISION.agg_index[Z]; for ( i=0; ivector_buffer + 12*index2; D_pt = D + 36*index1+9*Z; mvm_PRECISION( buffer1, D_pt, phi_pt ); mvm_PRECISION( buffer1+3, D_pt, phi_pt+3 ); mvm_PRECISION( buffer1+6, D_pt, phi_pt+6 ); mvm_PRECISION( buffer1+9, D_pt, phi_pt+9 ); - phi_pt = phi + 12*index1; + phi_pt = phi->vector_buffer + 12*index1; mvmh_PRECISION( buffer2, D_pt, phi_pt ); mvmh_PRECISION( buffer2+3, D_pt, phi_pt+3 ); mvmh_PRECISION( buffer2+6, D_pt, phi_pt+6 ); mvmh_PRECISION( buffer2+9, D_pt, phi_pt+9 ); - eta1_pt = eta1 + 12*index1; eta2_pt = eta2 + 12*index1; + eta1_pt = eta1->vector_buffer + 12*index1; eta2_pt = eta2->vector_buffer + 12*index1; twospin_p_Z_PRECISION( eta1_pt, eta2_pt, buffer1 ); - eta1_pt = eta1 + 12*index2; eta2_pt = eta2 + 12*index2; + eta1_pt = eta1->vector_buffer + 12*index2; eta2_pt = eta2->vector_buffer + 12*index2; twospin_n_Z_PRECISION( eta1_pt, eta2_pt, buffer2 ); } // Y dir length = l->is_PRECISION.agg_length[Y]; index_dir = l->is_PRECISION.agg_index[Y]; for ( i=0; ivector_buffer + 12*index2; D_pt = D + 36*index1+9*Y; mvm_PRECISION( buffer1, 
D_pt, phi_pt ); mvm_PRECISION( buffer1+3, D_pt, phi_pt+3 ); mvm_PRECISION( buffer1+6, D_pt, phi_pt+6 ); mvm_PRECISION( buffer1+9, D_pt, phi_pt+9 ); - phi_pt = phi + 12*index1; + phi_pt = phi->vector_buffer + 12*index1; mvmh_PRECISION( buffer2, D_pt, phi_pt ); mvmh_PRECISION( buffer2+3, D_pt, phi_pt+3 ); mvmh_PRECISION( buffer2+6, D_pt, phi_pt+6 ); mvmh_PRECISION( buffer2+9, D_pt, phi_pt+9 ); - eta1_pt = eta1 + 12*index1; eta2_pt = eta2 + 12*index1; + eta1_pt = eta1->vector_buffer + 12*index1; eta2_pt = eta2->vector_buffer + 12*index1; twospin_p_Y_PRECISION( eta1_pt, eta2_pt, buffer1 ); - eta1_pt = eta1 + 12*index2; eta2_pt = eta2 + 12*index2; + eta1_pt = eta1->vector_buffer + 12*index2; eta2_pt = eta2->vector_buffer + 12*index2; twospin_n_Y_PRECISION( eta1_pt, eta2_pt, buffer2 ); } // X dir length = l->is_PRECISION.agg_length[X]; index_dir = l->is_PRECISION.agg_index[X]; for ( i=0; ivector_buffer + 12*index2; D_pt = D + 36*index1+9*X; mvm_PRECISION( buffer1, D_pt, phi_pt ); mvm_PRECISION( buffer1+3, D_pt, phi_pt+3 ); mvm_PRECISION( buffer1+6, D_pt, phi_pt+6 ); mvm_PRECISION( buffer1+9, D_pt, phi_pt+9 ); - phi_pt = phi + 12*index1; + phi_pt = phi->vector_buffer + 12*index1; mvmh_PRECISION( buffer2, D_pt, phi_pt ); mvmh_PRECISION( buffer2+3, D_pt, phi_pt+3 ); mvmh_PRECISION( buffer2+6, D_pt, phi_pt+6 ); mvmh_PRECISION( buffer2+9, D_pt, phi_pt+9 ); - eta1_pt = eta1 + 12*index1; eta2_pt = eta2 + 12*index1; + eta1_pt = eta1->vector_buffer + 12*index1; eta2_pt = eta2->vector_buffer + 12*index1; twospin_p_X_PRECISION( eta1_pt, eta2_pt, buffer1 ); - eta1_pt = eta1 + 12*index2; eta2_pt = eta2 + 12*index2; + eta1_pt = eta1->vector_buffer + 12*index2; eta2_pt = eta2->vector_buffer + 12*index2; twospin_n_X_PRECISION( eta1_pt, eta2_pt, buffer2 ); } } -void d_neighbor_aggregate_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, vector_PRECISION phi, const int mu, schwarz_PRECISION_struct *s, level_struct *l ) { +void d_neighbor_aggregate_PRECISION( vector_PRECISION *eta1, vector_PRECISION *eta2, vector_PRECISION *phi, const int mu, schwarz_PRECISION_struct *s, level_struct *l ) { int i, length, index1, index2, *index_dir, *neighbor; - vector_PRECISION eta1_pt, eta2_pt, phi_pt; + buffer_PRECISION eta1_pt, eta2_pt, phi_pt; complex_PRECISION buffer1[12]; config_PRECISION D_pt, D = s->op.D; @@ -1108,54 +1414,54 @@ void d_neighbor_aggregate_PRECISION( vector_PRECISION eta1, vector_PRECISION eta // T dir for ( i=0; ivector_buffer + 12*index2; D_pt = D + 36*index1 + 9*T; mvm_PRECISION( buffer1, D_pt, phi_pt ); mvm_PRECISION( buffer1+3, D_pt, phi_pt+3 ); mvm_PRECISION( buffer1+6, D_pt, phi_pt+6 ); mvm_PRECISION( buffer1+9, D_pt, phi_pt+9 ); - eta1_pt = eta1 + 12*index1; eta2_pt = eta2 + 12*index1; + eta1_pt = eta1->vector_buffer + 12*index1; eta2_pt = eta2->vector_buffer + 12*index1; twospin2_p_T_PRECISION( eta1_pt, eta2_pt, buffer1 ); } } else if ( mu == Z ) { // Z dir for ( i=0; ivector_buffer + 12*index2; D_pt = D + 36*index1 + 9*Z; mvm_PRECISION( buffer1, D_pt, phi_pt ); mvm_PRECISION( buffer1+3, D_pt, phi_pt+3 ); mvm_PRECISION( buffer1+6, D_pt, phi_pt+6 ); mvm_PRECISION( buffer1+9, D_pt, phi_pt+9 ); - eta1_pt = eta1 + 12*index1; eta2_pt = eta2 + 12*index1; + eta1_pt = eta1->vector_buffer + 12*index1; eta2_pt = eta2->vector_buffer + 12*index1; twospin2_p_Z_PRECISION( eta1_pt, eta2_pt, buffer1 ); } } else if ( mu == Y ) { // Y dir for ( i=0; ivector_buffer + 12*index2; D_pt = D + 36*index1 + 9*Y; mvm_PRECISION( buffer1, D_pt, phi_pt ); mvm_PRECISION( buffer1+3, D_pt, phi_pt+3 ); mvm_PRECISION( 
buffer1+6, D_pt, phi_pt+6 ); mvm_PRECISION( buffer1+9, D_pt, phi_pt+9 ); - eta1_pt = eta1 + 12*index1; eta2_pt = eta2 + 12*index1; + eta1_pt = eta1->vector_buffer + 12*index1; eta2_pt = eta2->vector_buffer + 12*index1; twospin2_p_Y_PRECISION( eta1_pt, eta2_pt, buffer1 ); } } else if ( mu == X ) { // X dir for ( i=0; ivector_buffer + 12*index2; D_pt = D + 36*index1 + 9*X; mvm_PRECISION( buffer1, D_pt, phi_pt ); mvm_PRECISION( buffer1+3, D_pt, phi_pt+3 ); mvm_PRECISION( buffer1+6, D_pt, phi_pt+6 ); mvm_PRECISION( buffer1+9, D_pt, phi_pt+9 ); - eta1_pt = eta1 + 12*index1; eta2_pt = eta2 + 12*index1; + eta1_pt = eta1->vector_buffer + 12*index1; eta2_pt = eta2->vector_buffer + 12*index1; twospin2_p_X_PRECISION( eta1_pt, eta2_pt, buffer1 ); } } } -void apply_twisted_bc_to_vector_PRECISION( vector_PRECISION eta, vector_PRECISION phi, double *theta, level_struct *l) { +void apply_twisted_bc_to_vector_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, double *theta, level_struct *l) { int t, z, y, x, i; int *gl=l->global_lattice, sl[4]; double phase[4]; @@ -1174,10 +1480,46 @@ void apply_twisted_bc_to_vector_PRECISION( vector_PRECISION eta, vector_PRECISIO twisted_bc = exp(I*phase[X]); #ifdef HAVE_TM1p1 if( g.n_flavours == 2 ) { - FOR24( *eta = (*phi)*twisted_bc; phi++; eta++; ); + FOR24( *eta->vector_buffer = (*phi->vector_buffer)*twisted_bc; phi->vector_buffer++; eta->vector_buffer++; ); } else #endif - { FOR12( *eta = (*phi)*twisted_bc; phi++; eta++; ) } + { FOR12( *eta->vector_buffer = (*phi->vector_buffer)*twisted_bc; phi->vector_buffer++; eta->vector_buffer++; ) } + } + } + } + } +} + +void apply_twisted_bc_to_vector_PRECISION_new( vector_PRECISION *eta, vector_PRECISION *phi, double *theta, level_struct *l) { + int t, z, y, x, i, j; + int n_vect=g.num_rhs_vect; + int *gl=l->global_lattice, sl[4]; + double phase[4]; + complex_double twisted_bc; + for (i=0; i<4; i++) + sl[i] = l->local_lattice[i]*g.my_coords[i]; + + for (t=0; tlocal_lattice[0]; t++) { + phase[T] = theta[T]*((double)sl[T]+t)/(double)gl[T]; + for (z=0; zlocal_lattice[1]; z++) { + phase[Z] = phase[T] + theta[Z]*((double)sl[Z]+z)/(double)gl[Z]; + for (y=0; ylocal_lattice[2]; y++) { + phase[Y] = phase[Z] + theta[Y]*((double)sl[Y]+y)/(double)gl[Y]; + for (x=0; xlocal_lattice[3]; x++) { + phase[X] = phase[Y] + theta[X]*((double)sl[X]+x)/(double)gl[X]; + twisted_bc = exp(I*phase[X]); +/*#ifdef HAVE_TM1p1 + if( g.n_flavours == 2 ) { + FOR24( *eta->vector_buffer = (*phi->vector_buffer)*twisted_bc; phi->vector_buffer++; eta->vector_buffer++; ); + } else +#endif*/ + for (i=0; i<12; i++){ + for(j=0; jvector_buffer = (*phi->vector_buffer)*twisted_bc; + phi->vector_buffer++; + eta->vector_buffer++; + } + } } } } @@ -1188,13 +1530,9 @@ void operator_updates_PRECISION( level_struct *l, struct Thread *threading ) { if ( l->level > 0 ) { if ( !l->idle ) { -#ifdef INTERPOLATION_SETUP_LAYOUT_OPTIMIZED_PRECISION - coarse_operator_PRECISION_setup_vectorized( l->is_PRECISION.operator, l, threading ); - START_LOCKED_MASTER(threading) -#else START_LOCKED_MASTER(threading) coarse_operator_PRECISION_setup( l->is_PRECISION.interpolation, l ); -#endif + conf_PRECISION_gather( &(l->next_level->s_PRECISION.op), &(l->next_level->op_PRECISION), l->next_level ); END_LOCKED_MASTER(threading) if ( !l->next_level->idle && l->next_level->level > 0 ) { @@ -1457,73 +1795,81 @@ void two_flavours_test_PRECISION( operator_PRECISION_struct *op, level_struct *l #ifdef HAVE_TM1p1 double diff; - vector_double vd1=NULL, vd2, vd3, vd4, vdd1, vdd2, vdd3, vdd4; - 
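/* Hedged aside on apply_twisted_bc_to_vector_PRECISION(_new) above: the phase
 * accumulated over the four directions is
 *   twisted_bc = exp( i * sum_mu theta[mu] * (sl[mu]+x_mu) / gl[mu] ),
 * with sl[mu] the global offset of the local sublattice and gl[mu] the global
 * lattice extent.  Hypothetical per-site sketch (assumes complex_double is a
 * C99 double complex, cexp and I from <complex.h>): */
static complex_double twisted_phase( const double *theta, const int *global_coord,
                                     const int *gl ) {
  double phase = 0;
  for ( int mu=0; mu<4; mu++ )                       /* T, Z, Y, X */
    phase += theta[mu] * (double)global_coord[mu] / (double)gl[mu];
  return cexp( I*phase );                            /* factor applied to each site value */
}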
vector_PRECISION vpp1=NULL, vpp2; + vector_double vd[4], vdd[4]; + vector_PRECISION vpp[2]; + + for(int i=0; i<4; i++){ + vector_double_init( &vd[i] ); + vector_double_alloc( &vd[i], _INNER, 1, l, threading ); + vector_double_init( &vdd[i] ); + vector_double_alloc( &vdd[i], _INNER, 2, l, threading ); + } + + for(int i=0; i<2; i++){ + vector_PRECISION_init( &vpp[i] ); + vector_PRECISION_alloc( &vpp[i], _INNER, 2, l, threading ); + } ASSERT(g.n_flavours==2); data_layout_n_flavours( 1, l, threading ); - int ivs = l->inner_vector_size; - - PUBLIC_MALLOC( vd1, complex_double, 4*ivs + 2*4*ivs ); - PUBLIC_MALLOC( vpp1, complex_PRECISION, 2*2*ivs ); - - vd2 = vd1 + ivs; vd3 = vd2 + ivs; vd4 = vd3 + ivs; - vdd1 = vd4 + ivs; vdd2 = vdd1 + 2*ivs; vdd3 = vdd2 + 2*ivs; vdd4 = vdd3 + 2*ivs; - vpp2 = vpp1 + 2*ivs; - START_LOCKED_MASTER(threading) - vector_double_define_random( vd1, 0, l->inner_vector_size, l ); - vector_double_define_random( vd2, 0, l->inner_vector_size, l ); - apply_operator_double( vd3, vd1, &(g.p), l, no_threading ); + vector_double_define_random( &vd[0], 0, l->inner_vector_size, l ); + vector_double_define_random( &vd[1], 0, l->inner_vector_size, l ); + apply_operator_double( &vd[2], &vd[0], &(g.p), l, no_threading ); #ifdef HAVE_TM - vector_double_real_scale( g.op_double.tm_term, g.op_double.tm_term, -1, 0, l->inner_vector_size, l ); + buffer_double_real_scale( g.op_double.tm_term, g.op_double.tm_term, -1, 0, l->inner_vector_size, l ); #endif - apply_operator_double( vd4, vd2, &(g.p), l, no_threading ); + apply_operator_double( &vd[3], &vd[1], &(g.p), l, no_threading ); #ifdef HAVE_TM - vector_double_real_scale( g.op_double.tm_term, g.op_double.tm_term, -1, 0, l->inner_vector_size, l ); + buffer_double_real_scale( g.op_double.tm_term, g.op_double.tm_term, -1, 0, l->inner_vector_size, l ); #endif - add_diagonal_double( vd3, vd2, g.op_double.epsbar_term, l->inner_vector_size ); - add_diagonal_double( vd4, vd1, g.op_double.epsbar_term, l->inner_vector_size ); + add_diagonal_double( &vd[2], &vd[1], g.op_double.epsbar_term, l->inner_vector_size ); + add_diagonal_double( &vd[3], &vd[0], g.op_double.epsbar_term, l->inner_vector_size ); - two_flavours_to_serial_double( vd1, vd2, vdd1, l, no_threading ); - two_flavours_to_serial_double( vd3, vd4, vdd2, l, no_threading ); + two_flavours_to_serial_double( &vd[0], &vd[1], &vdd[0], l, no_threading ); + two_flavours_to_serial_double( &vd[2], &vd[3], &vdd[1], l, no_threading ); END_LOCKED_MASTER(threading) data_layout_n_flavours( 2, l, threading ); START_LOCKED_MASTER(threading) - trans_PRECISION( vpp1, vdd1, op->translation_table, l, no_threading ); - apply_operator_PRECISION( vpp2, vpp1, &(l->p_PRECISION), l, no_threading ); - trans_back_PRECISION( vdd3, vpp2, op->translation_table, l, no_threading ); + trans_PRECISION( &vpp[0], &vdd[0], op->translation_table, l, no_threading ); + apply_operator_PRECISION( &vpp[1], &vpp[0], &(l->p_PRECISION), l, no_threading ); + trans_back_PRECISION( &vdd[2], &vpp[1], op->translation_table, l, no_threading ); - vector_double_minus( vdd4, vdd3, vdd2, 0, l->inner_vector_size, l ); - diff = global_norm_double( vdd4, 0, l->inner_vector_size, l, no_threading ) / - global_norm_double( vdd3, 0, l->inner_vector_size, l, no_threading ); + vector_double_minus( &vdd[3], &vdd[2], &vdd[1], 0, l->inner_vector_size, l ); + diff = global_norm_double( &vdd[3], 0, l->inner_vector_size, l, no_threading ) / + global_norm_double( &vdd[2], 0, l->inner_vector_size, l, no_threading ); test0_PRECISION("depth: %d, correctness of doublet 
Dirac operator PRECISION: %le\n", l->depth, diff ); END_LOCKED_MASTER(threading) if(threading->n_core > 1) { - trans_PRECISION( vpp1, vdd1, op->translation_table, l, threading ); - apply_operator_PRECISION( vpp2, vpp1, &(l->p_PRECISION), l, threading ); - trans_back_PRECISION( vdd3, vpp2, op->translation_table, l, threading ); + trans_PRECISION( &vpp[0], &vdd[0], op->translation_table, l, threading ); + apply_operator_PRECISION( &vpp[1], &vpp[0], &(l->p_PRECISION), l, threading ); + trans_back_PRECISION( &vdd[2], &vpp[1], op->translation_table, l, threading ); SYNC_MASTER_TO_ALL(threading) SYNC_CORES(threading) START_LOCKED_MASTER(threading) - vector_double_minus( vdd4, vdd3, vdd2, 0, l->inner_vector_size, l ); - diff = global_norm_double( vdd4, 0, l->inner_vector_size, l, no_threading ) / - global_norm_double( vdd3, 0, l->inner_vector_size, l, no_threading ); + vector_double_minus( &vdd[3], &vdd[2], &vdd[1], 0, l->inner_vector_size, l ); + diff = global_norm_double( &vdd[3], 0, l->inner_vector_size, l, no_threading ) / + global_norm_double( &vdd[2], 0, l->inner_vector_size, l, no_threading ); test0_PRECISION("depth: %d, correctness of doublet Dirac operator PRECISION with threading: %le\n", l->depth, diff ); END_LOCKED_MASTER(threading) } - PUBLIC_FREE( vd1, complex_double, 4*ivs + 2*4*ivs ); - PUBLIC_FREE( vpp1, complex_PRECISION, 2*2*ivs ); + for(int i=0; i<4; i++){ + vector_double_free( &vd[i], l, threading ); + vector_double_free( &vdd[i], l, threading ); + } + + for(int i=0; i<2; i++) + vector_PRECISION_free( &vpp[i], l, threading ); START_LOCKED_MASTER(threading) if ( g.method >=4 && g.odd_even ) diff --git a/src/dirac_generic.h b/src/dirac_generic.h index 1224f78..0d76a73 100644 --- a/src/dirac_generic.h +++ b/src/dirac_generic.h @@ -24,20 +24,25 @@ struct Thread; - void two_flavours_to_serial_PRECISION( vector_PRECISION flav1, vector_PRECISION flav2, vector_PRECISION serial, level_struct *l, struct Thread *threading ); - void serial_to_two_flavours_PRECISION( vector_PRECISION flav1, vector_PRECISION flav2, vector_PRECISION serial, level_struct *l, struct Thread *threading ); + void two_flavours_to_serial_PRECISION( vector_PRECISION *flav1, vector_PRECISION *flav2, vector_PRECISION *serial, level_struct *l, struct Thread *threading ); + void serial_to_two_flavours_PRECISION( vector_PRECISION *flav1, vector_PRECISION *flav2, vector_PRECISION *serial, level_struct *l, struct Thread *threading ); - void clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operator_PRECISION_struct *op, int start, int end, level_struct *l, struct Thread *threading ); + void clover_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, operator_PRECISION_struct *op, int start, int end, level_struct *l, struct Thread *threading ); + void clover_PRECISION_new( vector_PRECISION *eta, vector_PRECISION *phi, operator_PRECISION_struct *op, int start, int end, level_struct *l, struct Thread *threading ); + + void d_plus_clover_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); + void d_plus_clover_PRECISION_new( vector_PRECISION *eta, vector_PRECISION *phi, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); + void d_plus_clover_dagger_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); + void g5D_plus_clover_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, operator_PRECISION_struct *op, level_struct *l, 
struct Thread *threading ); + void block_d_plus_clover_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, int start, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ); + void diagonal_aggregate_PRECISION( vector_PRECISION *eta1, vector_PRECISION *eta2, vector_PRECISION *phi, config_PRECISION diag, level_struct *l ); + void d_plus_clover_aggregate_PRECISION( vector_PRECISION *eta1, vector_PRECISION *eta2, vector_PRECISION *phi, schwarz_PRECISION_struct *s, level_struct *l ); + void d_neighbor_aggregate_PRECISION( vector_PRECISION *eta1, vector_PRECISION *eta2, vector_PRECISION *phi, const int mu, schwarz_PRECISION_struct *s, level_struct *l ); - void d_plus_clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); - void d_plus_clover_dagger_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); - void g5D_plus_clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); - void block_d_plus_clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, int start, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ); - void diagonal_aggregate_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, vector_PRECISION phi, config_PRECISION diag, level_struct *l ); - void d_plus_clover_aggregate_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, vector_PRECISION phi, schwarz_PRECISION_struct *s, level_struct *l ); - void d_neighbor_aggregate_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, vector_PRECISION phi, const int mu, schwarz_PRECISION_struct *s, level_struct *l ); - void apply_twisted_bc_to_vector_PRECISION( vector_PRECISION eta, vector_PRECISION phi, double *theta, level_struct *l); + void apply_twisted_bc_to_vector_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, double *theta, level_struct *l); + void apply_twisted_bc_to_vector_PRECISION_new( vector_PRECISION *eta, vector_PRECISION *phi, double *theta, level_struct *l); + void operator_updates_PRECISION( level_struct *l, struct Thread *threading ); void m0_update_PRECISION( PRECISION m0,operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); void tm_term_PRECISION_setup( PRECISION mu, PRECISION even, PRECISION odd, operator_PRECISION_struct *op, @@ -46,22 +51,22 @@ level_struct *l, struct Thread *threading ); void two_flavours_test_PRECISION( operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); - void gamma5_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ); - void tau1_gamma5_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ); - void set_even_to_zero_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ); - void gamma5_set_even_to_zero_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ); - void tau1_gamma5_set_even_to_zero_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ); - void set_odd_to_zero_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ); - void gamma5_set_odd_to_zero_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ); - void 
tau1_gamma5_set_odd_to_zero_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ); - void scale_even_odd_PRECISION( vector_PRECISION eta, vector_PRECISION phi, complex_double even, complex_double odd, + void gamma5_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, level_struct *l, struct Thread *threading ); + void tau1_gamma5_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, level_struct *l, struct Thread *threading ); + void set_even_to_zero_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, level_struct *l, struct Thread *threading ); + void gamma5_set_even_to_zero_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, level_struct *l, struct Thread *threading ); + void tau1_gamma5_set_even_to_zero_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, level_struct *l, struct Thread *threading ); + void set_odd_to_zero_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, level_struct *l, struct Thread *threading ); + void gamma5_set_odd_to_zero_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, level_struct *l, struct Thread *threading ); + void tau1_gamma5_set_odd_to_zero_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, level_struct *l, struct Thread *threading ); + void scale_even_odd_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, complex_double even, complex_double odd, level_struct *l, struct Thread *threading ); - static inline void add_diagonal_PRECISION( const vector_PRECISION eta, const vector_PRECISION phi, + static inline void add_diagonal_PRECISION( const vector_PRECISION *eta, const vector_PRECISION *phi, const config_PRECISION diag, const int length ) { config_PRECISION diag_pt = diag; - vector_PRECISION phi_pt = phi, eta_pt = eta, eta_end = eta + length; + buffer_PRECISION phi_pt = phi->vector_buffer, eta_pt = eta->vector_buffer, eta_end = eta->vector_buffer + length; #ifdef HAVE_TM1p1 if(g.n_flavours == 2) while ( eta_pt < eta_end ) { @@ -79,10 +84,10 @@ } #ifdef HAVE_TM1p1 - static inline void apply_doublet_coupling_PRECISION( const vector_PRECISION eta, const vector_PRECISION phi, + static inline void apply_doublet_coupling_PRECISION( const vector_PRECISION *eta, const vector_PRECISION *phi, const config_PRECISION diag, const int length ) { config_PRECISION diag_pt = diag; - vector_PRECISION phi_pt = phi, eta_pt = eta, eta_end = eta + length; + buffer_PRECISION phi_pt = phi->vector_buffer, eta_pt = eta->vector_buffer, eta_end = eta->vector_buffer + length; while ( eta_pt < eta_end ) { phi_pt += 6; FOR6( *eta_pt += (*phi_pt)*(*diag_pt); eta_pt++; phi_pt++; diag_pt++; ) @@ -95,7 +100,7 @@ #endif // eta = D*phi - static inline void mvm_PRECISION( const vector_PRECISION eta, const complex_PRECISION *D, const vector_PRECISION phi ) { + static inline void mvm_PRECISION( const buffer_PRECISION eta, const complex_PRECISION *D, const buffer_PRECISION phi ) { eta[0] = D[0]*phi[0]; eta[0] += D[1]*phi[1]; eta[0] += D[2]*phi[2]; @@ -106,9 +111,22 @@ eta[2] += D[7]*phi[1]; eta[2] += D[8]*phi[2]; } + + static inline void mvm_PRECISION_new( const buffer_PRECISION eta, const complex_PRECISION *D, const buffer_PRECISION phi ) { + int n_vect = g.num_rhs_vect, j, jj; + VECTOR_LOOP(j, n_vect, jj, eta[0*n_vect+j+jj] = D[0]*phi[0*n_vect+j+jj]; + eta[0*n_vect+j+jj] += D[1]*phi[1*n_vect+j+jj]; + eta[0*n_vect+j+jj] += D[2]*phi[2*n_vect+j+jj]; + eta[1*n_vect+j+jj] = D[3]*phi[0*n_vect+j+jj]; + eta[1*n_vect+j+jj] += D[4]*phi[1*n_vect+j+jj]; + eta[1*n_vect+j+jj] += D[5]*phi[2*n_vect+j+jj]; + 
eta[2*n_vect+j+jj] = D[6]*phi[0*n_vect+j+jj]; + eta[2*n_vect+j+jj] += D[7]*phi[1*n_vect+j+jj]; + eta[2*n_vect+j+jj] += D[8]*phi[2*n_vect+j+jj];) + } // eta = D**H*phi - static inline void mvmh_PRECISION( const vector_PRECISION eta, const complex_PRECISION *D, const vector_PRECISION phi ) { + static inline void mvmh_PRECISION( const buffer_PRECISION eta, const complex_PRECISION *D, const buffer_PRECISION phi ) { eta[0] = conj_PRECISION(D[0])*phi[0]; eta[1] = conj_PRECISION(D[1])*phi[0]; eta[2] = conj_PRECISION(D[2])*phi[0]; @@ -119,9 +137,22 @@ eta[1] += conj_PRECISION(D[7])*phi[2]; eta[2] += conj_PRECISION(D[8])*phi[2]; } + + static inline void mvmh_PRECISION_new( const buffer_PRECISION eta, const complex_PRECISION *D, const buffer_PRECISION phi ) { + int n_vect = g.num_rhs_vect, j, jj; + VECTOR_LOOP(j, n_vect, jj, eta[0*n_vect+j+jj] = conj_PRECISION(D[0])*phi[0*n_vect+j+jj]; + eta[1*n_vect+j+jj] = conj_PRECISION(D[1])*phi[0*n_vect+j+jj]; + eta[2*n_vect+j+jj] = conj_PRECISION(D[2])*phi[0*n_vect+j+jj]; + eta[0*n_vect+j+jj] += conj_PRECISION(D[3])*phi[1*n_vect+j+jj]; + eta[1*n_vect+j+jj] += conj_PRECISION(D[4])*phi[1*n_vect+j+jj]; + eta[2*n_vect+j+jj] += conj_PRECISION(D[5])*phi[1*n_vect+j+jj]; + eta[0*n_vect+j+jj] += conj_PRECISION(D[6])*phi[2*n_vect+j+jj]; + eta[1*n_vect+j+jj] += conj_PRECISION(D[7])*phi[2*n_vect+j+jj]; + eta[2*n_vect+j+jj] += conj_PRECISION(D[8])*phi[2*n_vect+j+jj];) + } // eta = -D*phi - static inline void nmvm_PRECISION( const vector_PRECISION eta, const complex_PRECISION *D, const vector_PRECISION phi ) { + static inline void nmvm_PRECISION( const buffer_PRECISION eta, const complex_PRECISION *D, const buffer_PRECISION phi ) { eta[0] = - D[0]*phi[0]; eta[0] -= D[1]*phi[1]; eta[0] -= D[2]*phi[2]; @@ -134,7 +165,7 @@ } // eta = -D**H*phi - static inline void nmvmh_PRECISION( const vector_PRECISION eta, const complex_PRECISION *D, const vector_PRECISION phi ) { + static inline void nmvmh_PRECISION( const buffer_PRECISION eta, const complex_PRECISION *D, const buffer_PRECISION phi ) { eta[0] = - conj_PRECISION(D[0])*phi[0]; eta[1] = - conj_PRECISION(D[1])*phi[0]; eta[2] = - conj_PRECISION(D[2])*phi[0]; @@ -147,7 +178,7 @@ } // 1 - gamma_T - static inline void prp_T_PRECISION( const vector_PRECISION prp_pt, const vector_PRECISION l_pt ) { + static inline void prp_T_PRECISION( const buffer_PRECISION prp_pt, const buffer_PRECISION l_pt ) { prp_pt[0] = l_pt[0] -GAMMA_T_SPIN0_VAL*l_pt[3*GAMMA_T_SPIN0_CO]; prp_pt[1] = l_pt[1] -GAMMA_T_SPIN0_VAL*l_pt[3*GAMMA_T_SPIN0_CO+1]; prp_pt[2] = l_pt[2] -GAMMA_T_SPIN0_VAL*l_pt[3*GAMMA_T_SPIN0_CO+2]; @@ -155,9 +186,19 @@ prp_pt[4] = l_pt[4] -GAMMA_T_SPIN1_VAL*l_pt[3*GAMMA_T_SPIN1_CO+1]; prp_pt[5] = l_pt[5] -GAMMA_T_SPIN1_VAL*l_pt[3*GAMMA_T_SPIN1_CO+2]; } + + static inline void prp_T_PRECISION_new( const buffer_PRECISION prp_pt, const buffer_PRECISION l_pt ) { + int n_vect = g.num_rhs_vect, j, jj; + VECTOR_LOOP(j, n_vect, jj, prp_pt[0*n_vect+j+jj] = l_pt[0*n_vect+j+jj] -GAMMA_T_SPIN0_VAL*l_pt[3*GAMMA_T_SPIN0_CO*n_vect+j+jj]; + prp_pt[1*n_vect+j+jj] = l_pt[1*n_vect+j+jj] -GAMMA_T_SPIN0_VAL*l_pt[(3*GAMMA_T_SPIN0_CO+1)*n_vect+j+jj]; + prp_pt[2*n_vect+j+jj] = l_pt[2*n_vect+j+jj] -GAMMA_T_SPIN0_VAL*l_pt[(3*GAMMA_T_SPIN0_CO+2)*n_vect+j+jj]; + prp_pt[3*n_vect+j+jj] = l_pt[3*n_vect+j+jj] -GAMMA_T_SPIN1_VAL*l_pt[3*GAMMA_T_SPIN1_CO*n_vect+j+jj]; + prp_pt[4*n_vect+j+jj] = l_pt[4*n_vect+j+jj] -GAMMA_T_SPIN1_VAL*l_pt[(3*GAMMA_T_SPIN1_CO+1)*n_vect+j+jj]; + prp_pt[5*n_vect+j+jj] = l_pt[5*n_vect+j+jj] 
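/* Hedged aside: mvm_PRECISION_new / mvmh_PRECISION_new above apply one 3x3
 * colour matrix D (respectively its conjugate transpose) to all n_vect
 * interleaved right-hand sides at once,
 *   eta[a*n_vect+r] = sum_b D[3a+b]       * phi[b*n_vect+r]   (mvm),
 *   eta[a*n_vect+r] = sum_b conj(D[3b+a]) * phi[b*n_vect+r]   (mvmh).
 * Scalar reference, illustration only (mvm_reference is hypothetical): */
static void mvm_reference( complex_PRECISION *eta, const complex_PRECISION *D,
                           const complex_PRECISION *phi, int n_vect ) {
  for ( int a=0; a<3; a++ )
    for ( int r=0; r<n_vect; r++ ) {
      eta[a*n_vect + r] = 0;
      for ( int b=0; b<3; b++ )
        eta[a*n_vect + r] += D[3*a + b] * phi[b*n_vect + r];
    }
}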
-GAMMA_T_SPIN1_VAL*l_pt[(3*GAMMA_T_SPIN1_CO+2)*n_vect+j+jj];) + } // 1 + gamma_T - static inline void prn_T_PRECISION( const vector_PRECISION prn_pt, const vector_PRECISION l_pt ) { + static inline void prn_T_PRECISION( const buffer_PRECISION prn_pt, const buffer_PRECISION l_pt ) { prn_pt[0] = l_pt[0] +GAMMA_T_SPIN0_VAL*l_pt[3*GAMMA_T_SPIN0_CO]; prn_pt[1] = l_pt[1] +GAMMA_T_SPIN0_VAL*l_pt[3*GAMMA_T_SPIN0_CO+1]; prn_pt[2] = l_pt[2] +GAMMA_T_SPIN0_VAL*l_pt[3*GAMMA_T_SPIN0_CO+2]; @@ -166,8 +207,18 @@ prn_pt[5] = l_pt[5] +GAMMA_T_SPIN1_VAL*l_pt[3*GAMMA_T_SPIN1_CO+2]; } + static inline void prn_T_PRECISION_new( const buffer_PRECISION prn_pt, const buffer_PRECISION l_pt ) { + int n_vect = g.num_rhs_vect, j, jj; + VECTOR_LOOP(j, n_vect, jj, prn_pt[0*n_vect+j+jj] = l_pt[0*n_vect+j+jj] +GAMMA_T_SPIN0_VAL*l_pt[3*GAMMA_T_SPIN0_CO*n_vect+j+jj]; + prn_pt[1*n_vect+j+jj] = l_pt[1*n_vect+j+jj] +GAMMA_T_SPIN0_VAL*l_pt[(3*GAMMA_T_SPIN0_CO+1)*n_vect+j+jj]; + prn_pt[2*n_vect+j+jj] = l_pt[2*n_vect+j+jj] +GAMMA_T_SPIN0_VAL*l_pt[(3*GAMMA_T_SPIN0_CO+2)*n_vect+j+jj]; + prn_pt[3*n_vect+j+jj] = l_pt[3*n_vect+j+jj] +GAMMA_T_SPIN1_VAL*l_pt[3*GAMMA_T_SPIN1_CO*n_vect+j+jj]; + prn_pt[4*n_vect+j+jj] = l_pt[4*n_vect+j+jj] +GAMMA_T_SPIN1_VAL*l_pt[(3*GAMMA_T_SPIN1_CO+1)*n_vect+j+jj]; + prn_pt[5*n_vect+j+jj] = l_pt[5*n_vect+j+jj] +GAMMA_T_SPIN1_VAL*l_pt[(3*GAMMA_T_SPIN1_CO+2)*n_vect+j+jj];) + } + // - (1 - gamma_T) - static inline void pbp_su3_T_PRECISION( const vector_PRECISION prp_su3_pt, const vector_PRECISION l_pt ) { + static inline void pbp_su3_T_PRECISION( const buffer_PRECISION prp_su3_pt, const buffer_PRECISION l_pt ) { l_pt[ 0] -= prp_su3_pt[0]; l_pt[ 1] -= prp_su3_pt[1]; l_pt[ 2] -= prp_su3_pt[2]; @@ -182,8 +233,24 @@ l_pt[11] += GAMMA_T_SPIN3_VAL*prp_su3_pt[3*GAMMA_T_SPIN3_CO+2]; } + static inline void pbp_su3_T_PRECISION_new( const buffer_PRECISION prp_su3_pt, const buffer_PRECISION l_pt ) { + int n_vect = g.num_rhs_vect, j, jj; + VECTOR_LOOP(j, n_vect, jj, l_pt[ 0*n_vect+j+jj] -= prp_su3_pt[0*n_vect+j+jj]; + l_pt[ 1*n_vect+j+jj] -= prp_su3_pt[1*n_vect+j+jj]; + l_pt[ 2*n_vect+j+jj] -= prp_su3_pt[2*n_vect+j+jj]; + l_pt[ 3*n_vect+j+jj] -= prp_su3_pt[3*n_vect+j+jj]; + l_pt[ 4*n_vect+j+jj] -= prp_su3_pt[4*n_vect+j+jj]; + l_pt[ 5*n_vect+j+jj] -= prp_su3_pt[5*n_vect+j+jj]; + l_pt[ 6*n_vect+j+jj] += GAMMA_T_SPIN2_VAL*prp_su3_pt[3*GAMMA_T_SPIN2_CO*n_vect+j+jj]; + l_pt[ 7*n_vect+j+jj] += GAMMA_T_SPIN2_VAL*prp_su3_pt[(3*GAMMA_T_SPIN2_CO+1)*n_vect+j+jj]; + l_pt[ 8*n_vect+j+jj] += GAMMA_T_SPIN2_VAL*prp_su3_pt[(3*GAMMA_T_SPIN2_CO+2)*n_vect+j+jj]; + l_pt[ 9*n_vect+j+jj] += GAMMA_T_SPIN3_VAL*prp_su3_pt[3*GAMMA_T_SPIN3_CO*n_vect+j+jj]; + l_pt[10*n_vect+j+jj] += GAMMA_T_SPIN3_VAL*prp_su3_pt[(3*GAMMA_T_SPIN3_CO+1)*n_vect+j+jj]; + l_pt[11*n_vect+j+jj] += GAMMA_T_SPIN3_VAL*prp_su3_pt[(3*GAMMA_T_SPIN3_CO+2)*n_vect+j+jj];) + } + // -(1 + gamma_T) - static inline void pbn_su3_T_PRECISION( const vector_PRECISION prn_su3_pt, const vector_PRECISION l_pt ) { + static inline void pbn_su3_T_PRECISION( const buffer_PRECISION prn_su3_pt, const buffer_PRECISION l_pt ) { l_pt[ 0] -= prn_su3_pt[0]; l_pt[ 1] -= prn_su3_pt[1]; l_pt[ 2] -= prn_su3_pt[2]; @@ -198,7 +265,23 @@ l_pt[11] -= GAMMA_T_SPIN3_VAL*prn_su3_pt[3*GAMMA_T_SPIN3_CO+2]; } - static inline void prp_Z_PRECISION( const vector_PRECISION prp_pt, const vector_PRECISION l_pt ) { + static inline void pbn_su3_T_PRECISION_new( const buffer_PRECISION prn_su3_pt, const buffer_PRECISION l_pt ) { + int n_vect = g.num_rhs_vect, j, jj; + VECTOR_LOOP(j, n_vect, jj, l_pt[ 0*n_vect+j+jj] -= 
prn_su3_pt[0*n_vect+j+jj]; + l_pt[ 1*n_vect+j+jj] -= prn_su3_pt[1*n_vect+j+jj]; + l_pt[ 2*n_vect+j+jj] -= prn_su3_pt[2*n_vect+j+jj]; + l_pt[ 3*n_vect+j+jj] -= prn_su3_pt[3*n_vect+j+jj]; + l_pt[ 4*n_vect+j+jj] -= prn_su3_pt[4*n_vect+j+jj]; + l_pt[ 5*n_vect+j+jj] -= prn_su3_pt[5*n_vect+j+jj]; + l_pt[ 6*n_vect+j+jj] -= GAMMA_T_SPIN2_VAL*prn_su3_pt[3*GAMMA_T_SPIN2_CO*n_vect+j+jj]; + l_pt[ 7*n_vect+j+jj] -= GAMMA_T_SPIN2_VAL*prn_su3_pt[(3*GAMMA_T_SPIN2_CO+1)*n_vect+j+jj]; + l_pt[ 8*n_vect+j+jj] -= GAMMA_T_SPIN2_VAL*prn_su3_pt[(3*GAMMA_T_SPIN2_CO+2)*n_vect+j+jj]; + l_pt[ 9*n_vect+j+jj] -= GAMMA_T_SPIN3_VAL*prn_su3_pt[3*GAMMA_T_SPIN3_CO*n_vect+j+jj]; + l_pt[10*n_vect+j+jj] -= GAMMA_T_SPIN3_VAL*prn_su3_pt[(3*GAMMA_T_SPIN3_CO+1)*n_vect+j+jj]; + l_pt[11*n_vect+j+jj] -= GAMMA_T_SPIN3_VAL*prn_su3_pt[(3*GAMMA_T_SPIN3_CO+2)*n_vect+j+jj];) + } + + static inline void prp_Z_PRECISION( const buffer_PRECISION prp_pt, const buffer_PRECISION l_pt ) { prp_pt[0] = l_pt[0] -GAMMA_Z_SPIN0_VAL*l_pt[3*GAMMA_Z_SPIN0_CO]; prp_pt[1] = l_pt[1] -GAMMA_Z_SPIN0_VAL*l_pt[3*GAMMA_Z_SPIN0_CO+1]; prp_pt[2] = l_pt[2] -GAMMA_Z_SPIN0_VAL*l_pt[3*GAMMA_Z_SPIN0_CO+2]; @@ -207,7 +290,17 @@ prp_pt[5] = l_pt[5] -GAMMA_Z_SPIN1_VAL*l_pt[3*GAMMA_Z_SPIN1_CO+2]; } - static inline void prn_Z_PRECISION( const vector_PRECISION prn_pt, const vector_PRECISION l_pt ) { + static inline void prp_Z_PRECISION_new( const buffer_PRECISION prp_pt, const buffer_PRECISION l_pt ) { + int n_vect = g.num_rhs_vect, j, jj; + VECTOR_LOOP(j, n_vect, jj, prp_pt[0*n_vect+j+jj] = l_pt[0*n_vect+j+jj] -GAMMA_Z_SPIN0_VAL*l_pt[3*GAMMA_Z_SPIN0_CO*n_vect+j+jj]; + prp_pt[1*n_vect+j+jj] = l_pt[1*n_vect+j+jj] -GAMMA_Z_SPIN0_VAL*l_pt[(3*GAMMA_Z_SPIN0_CO+1)*n_vect+j+jj]; + prp_pt[2*n_vect+j+jj] = l_pt[2*n_vect+j+jj] -GAMMA_Z_SPIN0_VAL*l_pt[(3*GAMMA_Z_SPIN0_CO+2)*n_vect+j+jj]; + prp_pt[3*n_vect+j+jj] = l_pt[3*n_vect+j+jj] -GAMMA_Z_SPIN1_VAL*l_pt[3*GAMMA_Z_SPIN1_CO*n_vect+j+jj]; + prp_pt[4*n_vect+j+jj] = l_pt[4*n_vect+j+jj] -GAMMA_Z_SPIN1_VAL*l_pt[(3*GAMMA_Z_SPIN1_CO+1)*n_vect+j+jj]; + prp_pt[5*n_vect+j+jj] = l_pt[5*n_vect+j+jj] -GAMMA_Z_SPIN1_VAL*l_pt[(3*GAMMA_Z_SPIN1_CO+2)*n_vect+j+jj];) + } + + static inline void prn_Z_PRECISION( const buffer_PRECISION prn_pt, const buffer_PRECISION l_pt ) { prn_pt[0] = l_pt[0] +GAMMA_Z_SPIN0_VAL*l_pt[3*GAMMA_Z_SPIN0_CO]; prn_pt[1] = l_pt[1] +GAMMA_Z_SPIN0_VAL*l_pt[3*GAMMA_Z_SPIN0_CO+1]; prn_pt[2] = l_pt[2] +GAMMA_Z_SPIN0_VAL*l_pt[3*GAMMA_Z_SPIN0_CO+2]; @@ -216,7 +309,17 @@ prn_pt[5] = l_pt[5] +GAMMA_Z_SPIN1_VAL*l_pt[3*GAMMA_Z_SPIN1_CO+2]; } - static inline void pbp_su3_Z_PRECISION( const vector_PRECISION prp_su3_pt, const vector_PRECISION l_pt ) { + static inline void prn_Z_PRECISION_new( const buffer_PRECISION prn_pt, const buffer_PRECISION l_pt ) { + int n_vect = g.num_rhs_vect, j, jj; + VECTOR_LOOP(j, n_vect, jj, prn_pt[0*n_vect+j+jj] = l_pt[0*n_vect+j+jj] +GAMMA_Z_SPIN0_VAL*l_pt[3*GAMMA_Z_SPIN0_CO*n_vect+j+jj]; + prn_pt[1*n_vect+j+jj] = l_pt[1*n_vect+j+jj] +GAMMA_Z_SPIN0_VAL*l_pt[(3*GAMMA_Z_SPIN0_CO+1)*n_vect+j+jj]; + prn_pt[2*n_vect+j+jj] = l_pt[2*n_vect+j+jj] +GAMMA_Z_SPIN0_VAL*l_pt[(3*GAMMA_Z_SPIN0_CO+2)*n_vect+j+jj]; + prn_pt[3*n_vect+j+jj] = l_pt[3*n_vect+j+jj] +GAMMA_Z_SPIN1_VAL*l_pt[3*GAMMA_Z_SPIN1_CO*n_vect+j+jj]; + prn_pt[4*n_vect+j+jj] = l_pt[4*n_vect+j+jj] +GAMMA_Z_SPIN1_VAL*l_pt[(3*GAMMA_Z_SPIN1_CO+1)*n_vect+j+jj]; + prn_pt[5*n_vect+j+jj] = l_pt[5*n_vect+j+jj] +GAMMA_Z_SPIN1_VAL*l_pt[(3*GAMMA_Z_SPIN1_CO+2)*n_vect+j+jj];) + } + + static inline void pbp_su3_Z_PRECISION( const buffer_PRECISION prp_su3_pt, const buffer_PRECISION 
l_pt ) { l_pt[ 0] -= prp_su3_pt[0]; l_pt[ 1] -= prp_su3_pt[1]; l_pt[ 2] -= prp_su3_pt[2]; @@ -231,7 +334,23 @@ l_pt[11] += GAMMA_Z_SPIN3_VAL*prp_su3_pt[3*GAMMA_Z_SPIN3_CO+2]; } - static inline void pbn_su3_Z_PRECISION( const vector_PRECISION prn_su3_pt, const vector_PRECISION l_pt ) { + static inline void pbp_su3_Z_PRECISION_new( const buffer_PRECISION prp_su3_pt, const buffer_PRECISION l_pt ) { + int n_vect = g.num_rhs_vect, j, jj; + VECTOR_LOOP(j, n_vect, jj, l_pt[ 0*n_vect+j+jj] -= prp_su3_pt[0*n_vect+j+jj]; + l_pt[ 1*n_vect+j+jj] -= prp_su3_pt[1*n_vect+j+jj]; + l_pt[ 2*n_vect+j+jj] -= prp_su3_pt[2*n_vect+j+jj]; + l_pt[ 3*n_vect+j+jj] -= prp_su3_pt[3*n_vect+j+jj]; + l_pt[ 4*n_vect+j+jj] -= prp_su3_pt[4*n_vect+j+jj]; + l_pt[ 5*n_vect+j+jj] -= prp_su3_pt[5*n_vect+j+jj]; + l_pt[ 6*n_vect+j+jj] += GAMMA_Z_SPIN2_VAL*prp_su3_pt[3*GAMMA_Z_SPIN2_CO*n_vect+j+jj]; + l_pt[ 7*n_vect+j+jj] += GAMMA_Z_SPIN2_VAL*prp_su3_pt[(3*GAMMA_Z_SPIN2_CO+1)*n_vect+j+jj]; + l_pt[ 8*n_vect+j+jj] += GAMMA_Z_SPIN2_VAL*prp_su3_pt[(3*GAMMA_Z_SPIN2_CO+2)*n_vect+j+jj]; + l_pt[ 9*n_vect+j+jj] += GAMMA_Z_SPIN3_VAL*prp_su3_pt[3*GAMMA_Z_SPIN3_CO*n_vect+j+jj]; + l_pt[10*n_vect+j+jj] += GAMMA_Z_SPIN3_VAL*prp_su3_pt[(3*GAMMA_Z_SPIN3_CO+1)*n_vect+j+jj]; + l_pt[11*n_vect+j+jj] += GAMMA_Z_SPIN3_VAL*prp_su3_pt[(3*GAMMA_Z_SPIN3_CO+2)*n_vect+j+jj];) + } + + static inline void pbn_su3_Z_PRECISION( const buffer_PRECISION prn_su3_pt, const buffer_PRECISION l_pt ) { l_pt[ 0] -= prn_su3_pt[0]; l_pt[ 1] -= prn_su3_pt[1]; l_pt[ 2] -= prn_su3_pt[2]; @@ -246,7 +365,23 @@ l_pt[11] -= GAMMA_Z_SPIN3_VAL*prn_su3_pt[3*GAMMA_Z_SPIN3_CO+2]; } - static inline void prp_Y_PRECISION( const vector_PRECISION prp_pt, const vector_PRECISION l_pt ) { + static inline void pbn_su3_Z_PRECISION_new( const buffer_PRECISION prn_su3_pt, const buffer_PRECISION l_pt ) { + int n_vect = g.num_rhs_vect, j, jj; + VECTOR_LOOP(j, n_vect, jj, l_pt[ 0*n_vect+j+jj] -= prn_su3_pt[0*n_vect+j+jj]; + l_pt[ 1*n_vect+j+jj] -= prn_su3_pt[1*n_vect+j+jj]; + l_pt[ 2*n_vect+j+jj] -= prn_su3_pt[2*n_vect+j+jj]; + l_pt[ 3*n_vect+j+jj] -= prn_su3_pt[3*n_vect+j+jj]; + l_pt[ 4*n_vect+j+jj] -= prn_su3_pt[4*n_vect+j+jj]; + l_pt[ 5*n_vect+j+jj] -= prn_su3_pt[5*n_vect+j+jj]; + l_pt[ 6*n_vect+j+jj] -= GAMMA_Z_SPIN2_VAL*prn_su3_pt[3*GAMMA_Z_SPIN2_CO*n_vect+j+jj]; + l_pt[ 7*n_vect+j+jj] -= GAMMA_Z_SPIN2_VAL*prn_su3_pt[(3*GAMMA_Z_SPIN2_CO+1)*n_vect+j+jj]; + l_pt[ 8*n_vect+j+jj] -= GAMMA_Z_SPIN2_VAL*prn_su3_pt[(3*GAMMA_Z_SPIN2_CO+2)*n_vect+j+jj]; + l_pt[ 9*n_vect+j+jj] -= GAMMA_Z_SPIN3_VAL*prn_su3_pt[3*GAMMA_Z_SPIN3_CO*n_vect+j+jj]; + l_pt[10*n_vect+j+jj] -= GAMMA_Z_SPIN3_VAL*prn_su3_pt[(3*GAMMA_Z_SPIN3_CO+1)*n_vect+j+jj]; + l_pt[11*n_vect+j+jj] -= GAMMA_Z_SPIN3_VAL*prn_su3_pt[(3*GAMMA_Z_SPIN3_CO+2)*n_vect+j+jj];) + } + + static inline void prp_Y_PRECISION( const buffer_PRECISION prp_pt, const buffer_PRECISION l_pt ) { prp_pt[0] = l_pt[0] -GAMMA_Y_SPIN0_VAL*l_pt[3*GAMMA_Y_SPIN0_CO]; prp_pt[1] = l_pt[1] -GAMMA_Y_SPIN0_VAL*l_pt[3*GAMMA_Y_SPIN0_CO+1]; prp_pt[2] = l_pt[2] -GAMMA_Y_SPIN0_VAL*l_pt[3*GAMMA_Y_SPIN0_CO+2]; @@ -255,7 +390,17 @@ prp_pt[5] = l_pt[5] -GAMMA_Y_SPIN1_VAL*l_pt[3*GAMMA_Y_SPIN1_CO+2]; } - static inline void prn_Y_PRECISION( const vector_PRECISION prn_pt, const vector_PRECISION l_pt ) { + static inline void prp_Y_PRECISION_new( const buffer_PRECISION prp_pt, const buffer_PRECISION l_pt ) { + int n_vect = g.num_rhs_vect, j, jj; + VECTOR_LOOP(j, n_vect, jj, prp_pt[0*n_vect+j+jj] = l_pt[0*n_vect+j+jj] -GAMMA_Y_SPIN0_VAL*l_pt[3*GAMMA_Y_SPIN0_CO*n_vect+j+jj]; + prp_pt[1*n_vect+j+jj] = 
l_pt[1*n_vect+j+jj] -GAMMA_Y_SPIN0_VAL*l_pt[(3*GAMMA_Y_SPIN0_CO+1)*n_vect+j+jj]; + prp_pt[2*n_vect+j+jj] = l_pt[2*n_vect+j+jj] -GAMMA_Y_SPIN0_VAL*l_pt[(3*GAMMA_Y_SPIN0_CO+2)*n_vect+j+jj]; + prp_pt[3*n_vect+j+jj] = l_pt[3*n_vect+j+jj] -GAMMA_Y_SPIN1_VAL*l_pt[3*GAMMA_Y_SPIN1_CO*n_vect+j+jj]; + prp_pt[4*n_vect+j+jj] = l_pt[4*n_vect+j+jj] -GAMMA_Y_SPIN1_VAL*l_pt[(3*GAMMA_Y_SPIN1_CO+1)*n_vect+j+jj]; + prp_pt[5*n_vect+j+jj] = l_pt[5*n_vect+j+jj] -GAMMA_Y_SPIN1_VAL*l_pt[(3*GAMMA_Y_SPIN1_CO+2)*n_vect+j+jj];) + } + + static inline void prn_Y_PRECISION( const buffer_PRECISION prn_pt, const buffer_PRECISION l_pt ) { prn_pt[0] = l_pt[0] +GAMMA_Y_SPIN0_VAL*l_pt[3*GAMMA_Y_SPIN0_CO]; prn_pt[1] = l_pt[1] +GAMMA_Y_SPIN0_VAL*l_pt[3*GAMMA_Y_SPIN0_CO+1]; prn_pt[2] = l_pt[2] +GAMMA_Y_SPIN0_VAL*l_pt[3*GAMMA_Y_SPIN0_CO+2]; @@ -264,7 +409,17 @@ prn_pt[5] = l_pt[5] +GAMMA_Y_SPIN1_VAL*l_pt[3*GAMMA_Y_SPIN1_CO+2]; } - static inline void pbp_su3_Y_PRECISION( const vector_PRECISION prp_su3_pt, const vector_PRECISION l_pt ) { + static inline void prn_Y_PRECISION_new( const buffer_PRECISION prn_pt, const buffer_PRECISION l_pt ) { + int n_vect = g.num_rhs_vect, j, jj; + VECTOR_LOOP(j, n_vect, jj, prn_pt[0*n_vect+j+jj] = l_pt[0*n_vect+j+jj] +GAMMA_Y_SPIN0_VAL*l_pt[3*GAMMA_Y_SPIN0_CO*n_vect+j+jj]; + prn_pt[1*n_vect+j+jj] = l_pt[1*n_vect+j+jj] +GAMMA_Y_SPIN0_VAL*l_pt[(3*GAMMA_Y_SPIN0_CO+1)*n_vect+j+jj]; + prn_pt[2*n_vect+j+jj] = l_pt[2*n_vect+j+jj] +GAMMA_Y_SPIN0_VAL*l_pt[(3*GAMMA_Y_SPIN0_CO+2)*n_vect+j+jj]; + prn_pt[3*n_vect+j+jj] = l_pt[3*n_vect+j+jj] +GAMMA_Y_SPIN1_VAL*l_pt[3*GAMMA_Y_SPIN1_CO*n_vect+j+jj]; + prn_pt[4*n_vect+j+jj] = l_pt[4*n_vect+j+jj] +GAMMA_Y_SPIN1_VAL*l_pt[(3*GAMMA_Y_SPIN1_CO+1)*n_vect+j+jj]; + prn_pt[5*n_vect+j+jj] = l_pt[5*n_vect+j+jj] +GAMMA_Y_SPIN1_VAL*l_pt[(3*GAMMA_Y_SPIN1_CO+2)*n_vect+j+jj];) + } + + static inline void pbp_su3_Y_PRECISION( const buffer_PRECISION prp_su3_pt, const buffer_PRECISION l_pt ) { l_pt[ 0] -= prp_su3_pt[0]; l_pt[ 1] -= prp_su3_pt[1]; l_pt[ 2] -= prp_su3_pt[2]; @@ -279,7 +434,23 @@ l_pt[11] += GAMMA_Y_SPIN3_VAL*prp_su3_pt[3*GAMMA_Y_SPIN3_CO+2]; } - static inline void pbn_su3_Y_PRECISION( const vector_PRECISION prn_su3_pt, const vector_PRECISION l_pt ) { + static inline void pbp_su3_Y_PRECISION_new( const buffer_PRECISION prp_su3_pt, const buffer_PRECISION l_pt ) { + int n_vect = g.num_rhs_vect, j, jj; + VECTOR_LOOP(j, n_vect, jj, l_pt[ 0*n_vect+j+jj] -= prp_su3_pt[0*n_vect+j+jj]; + l_pt[ 1*n_vect+j+jj] -= prp_su3_pt[1*n_vect+j+jj]; + l_pt[ 2*n_vect+j+jj] -= prp_su3_pt[2*n_vect+j+jj]; + l_pt[ 3*n_vect+j+jj] -= prp_su3_pt[3*n_vect+j+jj]; + l_pt[ 4*n_vect+j+jj] -= prp_su3_pt[4*n_vect+j+jj]; + l_pt[ 5*n_vect+j+jj] -= prp_su3_pt[5*n_vect+j+jj]; + l_pt[ 6*n_vect+j+jj] += GAMMA_Y_SPIN2_VAL*prp_su3_pt[3*GAMMA_Y_SPIN2_CO*n_vect+j+jj]; + l_pt[ 7*n_vect+j+jj] += GAMMA_Y_SPIN2_VAL*prp_su3_pt[(3*GAMMA_Y_SPIN2_CO+1)*n_vect+j+jj]; + l_pt[ 8*n_vect+j+jj] += GAMMA_Y_SPIN2_VAL*prp_su3_pt[(3*GAMMA_Y_SPIN2_CO+2)*n_vect+j+jj]; + l_pt[ 9*n_vect+j+jj] += GAMMA_Y_SPIN3_VAL*prp_su3_pt[3*GAMMA_Y_SPIN3_CO*n_vect+j+jj]; + l_pt[10*n_vect+j+jj] += GAMMA_Y_SPIN3_VAL*prp_su3_pt[(3*GAMMA_Y_SPIN3_CO+1)*n_vect+j+jj]; + l_pt[11*n_vect+j+jj] += GAMMA_Y_SPIN3_VAL*prp_su3_pt[(3*GAMMA_Y_SPIN3_CO+2)*n_vect+j+jj];) + } + + static inline void pbn_su3_Y_PRECISION( const buffer_PRECISION prn_su3_pt, const buffer_PRECISION l_pt ) { l_pt[ 0] -= prn_su3_pt[0]; l_pt[ 1] -= prn_su3_pt[1]; l_pt[ 2] -= prn_su3_pt[2]; @@ -294,7 +465,23 @@ l_pt[11] -= GAMMA_Y_SPIN3_VAL*prn_su3_pt[3*GAMMA_Y_SPIN3_CO+2]; } - static inline void 
prp_X_PRECISION( const vector_PRECISION prp_pt, const vector_PRECISION l_pt ) { + static inline void pbn_su3_Y_PRECISION_new( const buffer_PRECISION prn_su3_pt, const buffer_PRECISION l_pt ) { + int n_vect = g.num_rhs_vect, j, jj; + VECTOR_LOOP(j, n_vect, jj, l_pt[ 0*n_vect+j+jj] -= prn_su3_pt[0*n_vect+j+jj]; + l_pt[ 1*n_vect+j+jj] -= prn_su3_pt[1*n_vect+j+jj]; + l_pt[ 2*n_vect+j+jj] -= prn_su3_pt[2*n_vect+j+jj]; + l_pt[ 3*n_vect+j+jj] -= prn_su3_pt[3*n_vect+j+jj]; + l_pt[ 4*n_vect+j+jj] -= prn_su3_pt[4*n_vect+j+jj]; + l_pt[ 5*n_vect+j+jj] -= prn_su3_pt[5*n_vect+j+jj]; + l_pt[ 6*n_vect+j+jj] -= GAMMA_Y_SPIN2_VAL*prn_su3_pt[3*GAMMA_Y_SPIN2_CO*n_vect+j+jj]; + l_pt[ 7*n_vect+j+jj] -= GAMMA_Y_SPIN2_VAL*prn_su3_pt[(3*GAMMA_Y_SPIN2_CO+1)*n_vect+j+jj]; + l_pt[ 8*n_vect+j+jj] -= GAMMA_Y_SPIN2_VAL*prn_su3_pt[(3*GAMMA_Y_SPIN2_CO+2)*n_vect+j+jj]; + l_pt[ 9*n_vect+j+jj] -= GAMMA_Y_SPIN3_VAL*prn_su3_pt[3*GAMMA_Y_SPIN3_CO*n_vect+j+jj]; + l_pt[10*n_vect+j+jj] -= GAMMA_Y_SPIN3_VAL*prn_su3_pt[(3*GAMMA_Y_SPIN3_CO+1)*n_vect+j+jj]; + l_pt[11*n_vect+j+jj] -= GAMMA_Y_SPIN3_VAL*prn_su3_pt[(3*GAMMA_Y_SPIN3_CO+2)*n_vect+j+jj];) + } + + static inline void prp_X_PRECISION( const buffer_PRECISION prp_pt, const buffer_PRECISION l_pt ) { prp_pt[0] = l_pt[0] -GAMMA_X_SPIN0_VAL*l_pt[3*GAMMA_X_SPIN0_CO]; prp_pt[1] = l_pt[1] -GAMMA_X_SPIN0_VAL*l_pt[3*GAMMA_X_SPIN0_CO+1]; prp_pt[2] = l_pt[2] -GAMMA_X_SPIN0_VAL*l_pt[3*GAMMA_X_SPIN0_CO+2]; @@ -303,7 +490,17 @@ prp_pt[5] = l_pt[5] -GAMMA_X_SPIN1_VAL*l_pt[3*GAMMA_X_SPIN1_CO+2]; } - static inline void prn_X_PRECISION( const vector_PRECISION prn_pt, const vector_PRECISION l_pt ) { + static inline void prp_X_PRECISION_new( const buffer_PRECISION prp_pt, const buffer_PRECISION l_pt ) { + int n_vect = g.num_rhs_vect, j, jj; + VECTOR_LOOP(j, n_vect, jj, prp_pt[0*n_vect+j+jj] = l_pt[0*n_vect+j+jj] -GAMMA_X_SPIN0_VAL*l_pt[3*GAMMA_X_SPIN0_CO*n_vect+j+jj]; + prp_pt[1*n_vect+j+jj] = l_pt[1*n_vect+j+jj] -GAMMA_X_SPIN0_VAL*l_pt[(3*GAMMA_X_SPIN0_CO+1)*n_vect+j+jj]; + prp_pt[2*n_vect+j+jj] = l_pt[2*n_vect+j+jj] -GAMMA_X_SPIN0_VAL*l_pt[(3*GAMMA_X_SPIN0_CO+2)*n_vect+j+jj]; + prp_pt[3*n_vect+j+jj] = l_pt[3*n_vect+j+jj] -GAMMA_X_SPIN1_VAL*l_pt[3*GAMMA_X_SPIN1_CO*n_vect+j+jj]; + prp_pt[4*n_vect+j+jj] = l_pt[4*n_vect+j+jj] -GAMMA_X_SPIN1_VAL*l_pt[(3*GAMMA_X_SPIN1_CO+1)*n_vect+j+jj]; + prp_pt[5*n_vect+j+jj] = l_pt[5*n_vect+j+jj] -GAMMA_X_SPIN1_VAL*l_pt[(3*GAMMA_X_SPIN1_CO+2)*n_vect+j+jj];) + } + + static inline void prn_X_PRECISION( const buffer_PRECISION prn_pt, const buffer_PRECISION l_pt ) { prn_pt[0] = l_pt[0] +GAMMA_X_SPIN0_VAL*l_pt[3*GAMMA_X_SPIN0_CO]; prn_pt[1] = l_pt[1] +GAMMA_X_SPIN0_VAL*l_pt[3*GAMMA_X_SPIN0_CO+1]; prn_pt[2] = l_pt[2] +GAMMA_X_SPIN0_VAL*l_pt[3*GAMMA_X_SPIN0_CO+2]; @@ -312,7 +509,17 @@ prn_pt[5] = l_pt[5] +GAMMA_X_SPIN1_VAL*l_pt[3*GAMMA_X_SPIN1_CO+2]; } - static inline void pbp_su3_X_PRECISION( const vector_PRECISION prp_su3_pt, const vector_PRECISION l_pt ) { + static inline void prn_X_PRECISION_new( const buffer_PRECISION prn_pt, const buffer_PRECISION l_pt ) { + int n_vect = g.num_rhs_vect, j, jj; + VECTOR_LOOP(j, n_vect, jj, prn_pt[0*n_vect+j+jj] = l_pt[0*n_vect+j+jj] +GAMMA_X_SPIN0_VAL*l_pt[3*GAMMA_X_SPIN0_CO*n_vect+j+jj]; + prn_pt[1*n_vect+j+jj] = l_pt[1*n_vect+j+jj] +GAMMA_X_SPIN0_VAL*l_pt[(3*GAMMA_X_SPIN0_CO+1)*n_vect+j+jj]; + prn_pt[2*n_vect+j+jj] = l_pt[2*n_vect+j+jj] +GAMMA_X_SPIN0_VAL*l_pt[(3*GAMMA_X_SPIN0_CO+2)*n_vect+j+jj]; + prn_pt[3*n_vect+j+jj] = l_pt[3*n_vect+j+jj] +GAMMA_X_SPIN1_VAL*l_pt[3*GAMMA_X_SPIN1_CO*n_vect+j+jj]; + prn_pt[4*n_vect+j+jj] = 
l_pt[4*n_vect+j+jj] +GAMMA_X_SPIN1_VAL*l_pt[(3*GAMMA_X_SPIN1_CO+1)*n_vect+j+jj]; + prn_pt[5*n_vect+j+jj] = l_pt[5*n_vect+j+jj] +GAMMA_X_SPIN1_VAL*l_pt[(3*GAMMA_X_SPIN1_CO+2)*n_vect+j+jj];) + } + + static inline void pbp_su3_X_PRECISION( const buffer_PRECISION prp_su3_pt, const buffer_PRECISION l_pt ) { l_pt[ 0] -= prp_su3_pt[0]; l_pt[ 1] -= prp_su3_pt[1]; l_pt[ 2] -= prp_su3_pt[2]; @@ -327,7 +534,23 @@ l_pt[11] += GAMMA_X_SPIN3_VAL*prp_su3_pt[3*GAMMA_X_SPIN3_CO+2]; } - static inline void pbn_su3_X_PRECISION( const vector_PRECISION prn_su3_pt, const vector_PRECISION l_pt ) { + static inline void pbp_su3_X_PRECISION_new( const buffer_PRECISION prp_su3_pt, const buffer_PRECISION l_pt ) { + int n_vect = g.num_rhs_vect, j, jj; + VECTOR_LOOP(j, n_vect, jj, l_pt[ 0*n_vect+j+jj] -= prp_su3_pt[0*n_vect+j+jj]; + l_pt[ 1*n_vect+j+jj] -= prp_su3_pt[1*n_vect+j+jj]; + l_pt[ 2*n_vect+j+jj] -= prp_su3_pt[2*n_vect+j+jj]; + l_pt[ 3*n_vect+j+jj] -= prp_su3_pt[3*n_vect+j+jj]; + l_pt[ 4*n_vect+j+jj] -= prp_su3_pt[4*n_vect+j+jj]; + l_pt[ 5*n_vect+j+jj] -= prp_su3_pt[5*n_vect+j+jj]; + l_pt[ 6*n_vect+j+jj] += GAMMA_X_SPIN2_VAL*prp_su3_pt[3*GAMMA_X_SPIN2_CO*n_vect+j+jj]; + l_pt[ 7*n_vect+j+jj] += GAMMA_X_SPIN2_VAL*prp_su3_pt[(3*GAMMA_X_SPIN2_CO+1)*n_vect+j+jj]; + l_pt[ 8*n_vect+j+jj] += GAMMA_X_SPIN2_VAL*prp_su3_pt[(3*GAMMA_X_SPIN2_CO+2)*n_vect+j+jj]; + l_pt[ 9*n_vect+j+jj] += GAMMA_X_SPIN3_VAL*prp_su3_pt[3*GAMMA_X_SPIN3_CO*n_vect+j+jj]; + l_pt[10*n_vect+j+jj] += GAMMA_X_SPIN3_VAL*prp_su3_pt[(3*GAMMA_X_SPIN3_CO+1)*n_vect+j+jj]; + l_pt[11*n_vect+j+jj] += GAMMA_X_SPIN3_VAL*prp_su3_pt[(3*GAMMA_X_SPIN3_CO+2)*n_vect+j+jj];) + } + + static inline void pbn_su3_X_PRECISION( const buffer_PRECISION prn_su3_pt, const buffer_PRECISION l_pt ) { l_pt[ 0] -= prn_su3_pt[0]; l_pt[ 1] -= prn_su3_pt[1]; l_pt[ 2] -= prn_su3_pt[2]; @@ -342,6 +565,22 @@ l_pt[11] -= GAMMA_X_SPIN3_VAL*prn_su3_pt[3*GAMMA_X_SPIN3_CO+2]; } + static inline void pbn_su3_X_PRECISION_new( const buffer_PRECISION prn_su3_pt, const buffer_PRECISION l_pt ) { + int n_vect = g.num_rhs_vect, j, jj; + VECTOR_LOOP(j, n_vect, jj, l_pt[ 0*n_vect+j+jj] -= prn_su3_pt[0*n_vect+j+jj]; + l_pt[ 1*n_vect+j+jj] -= prn_su3_pt[1*n_vect+j+jj]; + l_pt[ 2*n_vect+j+jj] -= prn_su3_pt[2*n_vect+j+jj]; + l_pt[ 3*n_vect+j+jj] -= prn_su3_pt[3*n_vect+j+jj]; + l_pt[ 4*n_vect+j+jj] -= prn_su3_pt[4*n_vect+j+jj]; + l_pt[ 5*n_vect+j+jj] -= prn_su3_pt[5*n_vect+j+jj]; + l_pt[ 6*n_vect+j+jj] -= GAMMA_X_SPIN2_VAL*prn_su3_pt[3*GAMMA_X_SPIN2_CO*n_vect+j+jj]; + l_pt[ 7*n_vect+j+jj] -= GAMMA_X_SPIN2_VAL*prn_su3_pt[(3*GAMMA_X_SPIN2_CO+1)*n_vect+j+jj]; + l_pt[ 8*n_vect+j+jj] -= GAMMA_X_SPIN2_VAL*prn_su3_pt[(3*GAMMA_X_SPIN2_CO+2)*n_vect+j+jj]; + l_pt[ 9*n_vect+j+jj] -= GAMMA_X_SPIN3_VAL*prn_su3_pt[3*GAMMA_X_SPIN3_CO*n_vect+j+jj]; + l_pt[10*n_vect+j+jj] -= GAMMA_X_SPIN3_VAL*prn_su3_pt[(3*GAMMA_X_SPIN3_CO+1)*n_vect+j+jj]; + l_pt[11*n_vect+j+jj] -= GAMMA_X_SPIN3_VAL*prn_su3_pt[(3*GAMMA_X_SPIN3_CO+2)*n_vect+j+jj];) + } + //START #ifdef HAVE_TM1p1 @@ -349,7 +588,7 @@ #define flav_gamma(k) (3*(k)+6*((k)/2)) // 1 - gamma_T - static inline void dprp_T_PRECISION( const vector_PRECISION prp_pt, const vector_PRECISION l_pt ) { + static inline void dprp_T_PRECISION( const buffer_PRECISION prp_pt, const buffer_PRECISION l_pt ) { prp_pt[ 0] = l_pt[ 0] -GAMMA_T_SPIN0_VAL*l_pt[flav_gamma(GAMMA_T_SPIN0_CO)]; prp_pt[ 1] = l_pt[ 1] -GAMMA_T_SPIN0_VAL*l_pt[flav_gamma(GAMMA_T_SPIN0_CO)+1]; prp_pt[ 2] = l_pt[ 2] -GAMMA_T_SPIN0_VAL*l_pt[flav_gamma(GAMMA_T_SPIN0_CO)+2]; @@ -365,7 +604,7 @@ } // 1 + gamma_T - static inline void 
dprn_T_PRECISION( const vector_PRECISION prn_pt, const vector_PRECISION l_pt ) { + static inline void dprn_T_PRECISION( const buffer_PRECISION prn_pt, const buffer_PRECISION l_pt ) { prn_pt[ 0] = l_pt[ 0] +GAMMA_T_SPIN0_VAL*l_pt[flav_gamma(GAMMA_T_SPIN0_CO)]; prn_pt[ 1] = l_pt[ 1] +GAMMA_T_SPIN0_VAL*l_pt[flav_gamma(GAMMA_T_SPIN0_CO)+1]; prn_pt[ 2] = l_pt[ 2] +GAMMA_T_SPIN0_VAL*l_pt[flav_gamma(GAMMA_T_SPIN0_CO)+2]; @@ -381,7 +620,7 @@ } // - (1 - gamma_T) - static inline void dpbp_su3_T_PRECISION( const vector_PRECISION prp_su3_pt, const vector_PRECISION l_pt ) { + static inline void dpbp_su3_T_PRECISION( const buffer_PRECISION prp_su3_pt, const buffer_PRECISION l_pt ) { l_pt[ 0] -= prp_su3_pt[ 0]; l_pt[ 1] -= prp_su3_pt[ 1]; l_pt[ 2] -= prp_su3_pt[ 2]; @@ -409,7 +648,7 @@ } // -(1 + gamma_T) - static inline void dpbn_su3_T_PRECISION( const vector_PRECISION prn_su3_pt, const vector_PRECISION l_pt ) { + static inline void dpbn_su3_T_PRECISION( const buffer_PRECISION prn_su3_pt, const buffer_PRECISION l_pt ) { l_pt[ 0] -= prn_su3_pt[ 0]; l_pt[ 1] -= prn_su3_pt[ 1]; l_pt[ 2] -= prn_su3_pt[ 2]; @@ -438,7 +677,7 @@ // 1 - gamma_Z - static inline void dprp_Z_PRECISION( const vector_PRECISION prp_pt, const vector_PRECISION l_pt ) { + static inline void dprp_Z_PRECISION( const buffer_PRECISION prp_pt, const buffer_PRECISION l_pt ) { prp_pt[ 0] = l_pt[ 0] -GAMMA_Z_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Z_SPIN0_CO)]; prp_pt[ 1] = l_pt[ 1] -GAMMA_Z_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Z_SPIN0_CO)+1]; prp_pt[ 2] = l_pt[ 2] -GAMMA_Z_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Z_SPIN0_CO)+2]; @@ -454,7 +693,7 @@ } // 1 + gamma_Z - static inline void dprn_Z_PRECISION( const vector_PRECISION prn_pt, const vector_PRECISION l_pt ) { + static inline void dprn_Z_PRECISION( const buffer_PRECISION prn_pt, const buffer_PRECISION l_pt ) { prn_pt[ 0] = l_pt[ 0] +GAMMA_Z_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Z_SPIN0_CO)]; prn_pt[ 1] = l_pt[ 1] +GAMMA_Z_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Z_SPIN0_CO)+1]; prn_pt[ 2] = l_pt[ 2] +GAMMA_Z_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Z_SPIN0_CO)+2]; @@ -470,7 +709,7 @@ } // - (1 - gamma_Z) - static inline void dpbp_su3_Z_PRECISION( const vector_PRECISION prp_su3_pt, const vector_PRECISION l_pt ) { + static inline void dpbp_su3_Z_PRECISION( const buffer_PRECISION prp_su3_pt, const buffer_PRECISION l_pt ) { l_pt[ 0] -= prp_su3_pt[ 0]; l_pt[ 1] -= prp_su3_pt[ 1]; l_pt[ 2] -= prp_su3_pt[ 2]; @@ -498,7 +737,7 @@ } // -(1 + gamma_Z) - static inline void dpbn_su3_Z_PRECISION( const vector_PRECISION prn_su3_pt, const vector_PRECISION l_pt ) { + static inline void dpbn_su3_Z_PRECISION( const buffer_PRECISION prn_su3_pt, const buffer_PRECISION l_pt ) { l_pt[ 0] -= prn_su3_pt[ 0]; l_pt[ 1] -= prn_su3_pt[ 1]; l_pt[ 2] -= prn_su3_pt[ 2]; @@ -527,7 +766,7 @@ // 1 - gamma_Y - static inline void dprp_Y_PRECISION( const vector_PRECISION prp_pt, const vector_PRECISION l_pt ) { + static inline void dprp_Y_PRECISION( const buffer_PRECISION prp_pt, const buffer_PRECISION l_pt ) { prp_pt[ 0] = l_pt[ 0] -GAMMA_Y_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Y_SPIN0_CO)]; prp_pt[ 1] = l_pt[ 1] -GAMMA_Y_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Y_SPIN0_CO)+1]; prp_pt[ 2] = l_pt[ 2] -GAMMA_Y_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Y_SPIN0_CO)+2]; @@ -543,7 +782,7 @@ } // 1 + gamma_Y - static inline void dprn_Y_PRECISION( const vector_PRECISION prn_pt, const vector_PRECISION l_pt ) { + static inline void dprn_Y_PRECISION( const buffer_PRECISION prn_pt, const buffer_PRECISION l_pt ) { prn_pt[ 0] = l_pt[ 0] +GAMMA_Y_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Y_SPIN0_CO)]; prn_pt[ 1] = 
l_pt[ 1] +GAMMA_Y_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Y_SPIN0_CO)+1]; prn_pt[ 2] = l_pt[ 2] +GAMMA_Y_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Y_SPIN0_CO)+2]; @@ -559,7 +798,7 @@ } // - (1 - gamma_Y) - static inline void dpbp_su3_Y_PRECISION( const vector_PRECISION prp_su3_pt, const vector_PRECISION l_pt ) { + static inline void dpbp_su3_Y_PRECISION( const buffer_PRECISION prp_su3_pt, const buffer_PRECISION l_pt ) { l_pt[ 0] -= prp_su3_pt[ 0]; l_pt[ 1] -= prp_su3_pt[ 1]; l_pt[ 2] -= prp_su3_pt[ 2]; @@ -587,7 +826,7 @@ } // -(1 + gamma_Y) - static inline void dpbn_su3_Y_PRECISION( const vector_PRECISION prn_su3_pt, const vector_PRECISION l_pt ) { + static inline void dpbn_su3_Y_PRECISION( const buffer_PRECISION prn_su3_pt, const buffer_PRECISION l_pt ) { l_pt[ 0] -= prn_su3_pt[ 0]; l_pt[ 1] -= prn_su3_pt[ 1]; l_pt[ 2] -= prn_su3_pt[ 2]; @@ -616,7 +855,7 @@ // 1 - gamma_X - static inline void dprp_X_PRECISION( const vector_PRECISION prp_pt, const vector_PRECISION l_pt ) { + static inline void dprp_X_PRECISION( const buffer_PRECISION prp_pt, const buffer_PRECISION l_pt ) { prp_pt[ 0] = l_pt[ 0] -GAMMA_X_SPIN0_VAL*l_pt[flav_gamma(GAMMA_X_SPIN0_CO)]; prp_pt[ 1] = l_pt[ 1] -GAMMA_X_SPIN0_VAL*l_pt[flav_gamma(GAMMA_X_SPIN0_CO)+1]; prp_pt[ 2] = l_pt[ 2] -GAMMA_X_SPIN0_VAL*l_pt[flav_gamma(GAMMA_X_SPIN0_CO)+2]; @@ -632,7 +871,7 @@ } // 1 + gamma_X - static inline void dprn_X_PRECISION( const vector_PRECISION prn_pt, const vector_PRECISION l_pt ) { + static inline void dprn_X_PRECISION( const buffer_PRECISION prn_pt, const buffer_PRECISION l_pt ) { prn_pt[ 0] = l_pt[ 0] +GAMMA_X_SPIN0_VAL*l_pt[flav_gamma(GAMMA_X_SPIN0_CO)]; prn_pt[ 1] = l_pt[ 1] +GAMMA_X_SPIN0_VAL*l_pt[flav_gamma(GAMMA_X_SPIN0_CO)+1]; prn_pt[ 2] = l_pt[ 2] +GAMMA_X_SPIN0_VAL*l_pt[flav_gamma(GAMMA_X_SPIN0_CO)+2]; @@ -648,7 +887,7 @@ } // - (1 - gamma_X) - static inline void dpbp_su3_X_PRECISION( const vector_PRECISION prp_su3_pt, const vector_PRECISION l_pt ) { + static inline void dpbp_su3_X_PRECISION( const buffer_PRECISION prp_su3_pt, const buffer_PRECISION l_pt ) { l_pt[ 0] -= prp_su3_pt[ 0]; l_pt[ 1] -= prp_su3_pt[ 1]; l_pt[ 2] -= prp_su3_pt[ 2]; @@ -676,7 +915,7 @@ } // -(1 + gamma_X) - static inline void dpbn_su3_X_PRECISION( const vector_PRECISION prn_su3_pt, const vector_PRECISION l_pt ) { + static inline void dpbn_su3_X_PRECISION( const buffer_PRECISION prn_su3_pt, const buffer_PRECISION l_pt ) { l_pt[ 0] -= prn_su3_pt[ 0]; l_pt[ 1] -= prn_su3_pt[ 1]; l_pt[ 2] -= prn_su3_pt[ 2]; @@ -706,7 +945,7 @@ #endif //END - static inline void twospin_p_T_PRECISION( const vector_PRECISION out_spin0and1, const vector_PRECISION out_spin2and3, const vector_PRECISION in ) { + static inline void twospin_p_T_PRECISION( const buffer_PRECISION out_spin0and1, const buffer_PRECISION out_spin2and3, const buffer_PRECISION in ) { out_spin0and1[ 0] -= in[ 0]; out_spin0and1[ 1] -= in[ 1]; out_spin0and1[ 2] -= in[ 2]; @@ -733,7 +972,7 @@ out_spin2and3[11] -= in[11]; } - static inline void twospin2_p_T_PRECISION( const vector_PRECISION out_spin0and1, const vector_PRECISION out_spin2and3, const vector_PRECISION in ) { + static inline void twospin2_p_T_PRECISION( const buffer_PRECISION out_spin0and1, const buffer_PRECISION out_spin2and3, const buffer_PRECISION in ) { out_spin0and1[ 0] = in[ 0]; out_spin0and1[ 1] = in[ 1]; out_spin0and1[ 2] = in[ 2]; @@ -760,7 +999,7 @@ out_spin2and3[11] = in[11]; } - static inline void twospin_n_T_PRECISION( const vector_PRECISION out_spin0and1, const vector_PRECISION out_spin2and3, const vector_PRECISION in ) { + static inline void 
twospin_n_T_PRECISION( const buffer_PRECISION out_spin0and1, const buffer_PRECISION out_spin2and3, const buffer_PRECISION in ) { out_spin0and1[ 0] -= in[ 0]; out_spin0and1[ 1] -= in[ 1]; out_spin0and1[ 2] -= in[ 2]; @@ -787,7 +1026,7 @@ out_spin2and3[11] -= in[11]; } - static inline void twospin_p_Z_PRECISION( const vector_PRECISION out_spin0and1, const vector_PRECISION out_spin2and3, const vector_PRECISION in ) { + static inline void twospin_p_Z_PRECISION( const buffer_PRECISION out_spin0and1, const buffer_PRECISION out_spin2and3, const buffer_PRECISION in ) { out_spin0and1[ 0] -= in[ 0]; out_spin0and1[ 1] -= in[ 1]; out_spin0and1[ 2] -= in[ 2]; @@ -814,7 +1053,7 @@ out_spin2and3[11] -= in[11]; } - static inline void twospin2_p_Z_PRECISION( const vector_PRECISION out_spin0and1, const vector_PRECISION out_spin2and3, const vector_PRECISION in ) { + static inline void twospin2_p_Z_PRECISION( const buffer_PRECISION out_spin0and1, const buffer_PRECISION out_spin2and3, const buffer_PRECISION in ) { out_spin0and1[ 0] = in[ 0]; out_spin0and1[ 1] = in[ 1]; out_spin0and1[ 2] = in[ 2]; @@ -841,7 +1080,7 @@ out_spin2and3[11] = in[11]; } - static inline void twospin_n_Z_PRECISION( const vector_PRECISION out_spin0and1, const vector_PRECISION out_spin2and3, const vector_PRECISION in ) { + static inline void twospin_n_Z_PRECISION( const buffer_PRECISION out_spin0and1, const buffer_PRECISION out_spin2and3, const buffer_PRECISION in ) { out_spin0and1[ 0] -= in[ 0]; out_spin0and1[ 1] -= in[ 1]; out_spin0and1[ 2] -= in[ 2]; @@ -868,7 +1107,7 @@ out_spin2and3[11] -= in[11]; } - static inline void twospin_p_Y_PRECISION( const vector_PRECISION out_spin0and1, const vector_PRECISION out_spin2and3, const vector_PRECISION in ) { + static inline void twospin_p_Y_PRECISION( const buffer_PRECISION out_spin0and1, const buffer_PRECISION out_spin2and3, const buffer_PRECISION in ) { out_spin0and1[ 0] -= in[ 0]; out_spin0and1[ 1] -= in[ 1]; out_spin0and1[ 2] -= in[ 2]; @@ -895,7 +1134,7 @@ out_spin2and3[11] -= in[11]; } - static inline void twospin2_p_Y_PRECISION( const vector_PRECISION out_spin0and1, const vector_PRECISION out_spin2and3, const vector_PRECISION in ) { + static inline void twospin2_p_Y_PRECISION( const buffer_PRECISION out_spin0and1, const buffer_PRECISION out_spin2and3, const buffer_PRECISION in ) { out_spin0and1[ 0] = in[ 0]; out_spin0and1[ 1] = in[ 1]; out_spin0and1[ 2] = in[ 2]; @@ -922,7 +1161,7 @@ out_spin2and3[11] = in[11]; } - static inline void twospin_n_Y_PRECISION( const vector_PRECISION out_spin0and1, const vector_PRECISION out_spin2and3, const vector_PRECISION in ) { + static inline void twospin_n_Y_PRECISION( const buffer_PRECISION out_spin0and1, const buffer_PRECISION out_spin2and3, const buffer_PRECISION in ) { out_spin0and1[ 0] -= in[ 0]; out_spin0and1[ 1] -= in[ 1]; out_spin0and1[ 2] -= in[ 2]; @@ -949,7 +1188,7 @@ out_spin2and3[11] -= in[11]; } - static inline void twospin_p_X_PRECISION( const vector_PRECISION out_spin0and1, const vector_PRECISION out_spin2and3, const vector_PRECISION in ) { + static inline void twospin_p_X_PRECISION( const buffer_PRECISION out_spin0and1, const buffer_PRECISION out_spin2and3, const buffer_PRECISION in ) { out_spin0and1[ 0] -= in[ 0]; out_spin0and1[ 1] -= in[ 1]; out_spin0and1[ 2] -= in[ 2]; @@ -976,7 +1215,7 @@ out_spin2and3[11] -= in[11]; } - static inline void twospin2_p_X_PRECISION( const vector_PRECISION out_spin0and1, const vector_PRECISION out_spin2and3, const vector_PRECISION in ) { + static inline void twospin2_p_X_PRECISION( const 
buffer_PRECISION out_spin0and1, const buffer_PRECISION out_spin2and3, const buffer_PRECISION in ) { out_spin0and1[ 0] = in[ 0]; out_spin0and1[ 1] = in[ 1]; out_spin0and1[ 2] = in[ 2]; @@ -1003,7 +1242,7 @@ out_spin2and3[11] = in[11]; } - static inline void twospin_n_X_PRECISION( const vector_PRECISION out_spin0and1, const vector_PRECISION out_spin2and3, const vector_PRECISION in ) { + static inline void twospin_n_X_PRECISION( const buffer_PRECISION out_spin0and1, const buffer_PRECISION out_spin2and3, const buffer_PRECISION in ) { out_spin0and1[ 0] -= in[ 0]; out_spin0and1[ 1] -= in[ 1]; out_spin0and1[ 2] -= in[ 2]; @@ -1030,7 +1269,7 @@ out_spin2and3[11] -= in[11]; } - static inline void doublet_site_clover_PRECISION( const vector_PRECISION eta, const vector_PRECISION phi, const config_PRECISION clover ) { + static inline void doublet_site_clover_PRECISION( const buffer_PRECISION eta, const buffer_PRECISION phi, const config_PRECISION clover ) { // diagonal eta[ 0] = clover[ 0]*phi[ 0]; eta[ 1] = clover[ 1]*phi[ 1]; @@ -1182,7 +1421,7 @@ eta[23] += conj_PRECISION(clover[41])*phi[22]; } - static inline void spin0and1_site_clover_PRECISION( const vector_PRECISION eta, const vector_PRECISION phi, const config_PRECISION clover ) { + static inline void spin0and1_site_clover_PRECISION( const buffer_PRECISION eta, const buffer_PRECISION phi, const config_PRECISION clover ) { // diagonal eta[ 0] = clover[ 0]*phi[ 0]; eta[ 1] = clover[ 1]*phi[ 1]; @@ -1229,7 +1468,7 @@ eta[5] += conj_PRECISION(clover[26])*phi[4]; } - static inline void spin2and3_site_clover_PRECISION( const vector_PRECISION eta, const vector_PRECISION phi, const config_PRECISION clover ) { + static inline void spin2and3_site_clover_PRECISION( const buffer_PRECISION eta, const buffer_PRECISION phi, const config_PRECISION clover ) { // diagonal eta[ 0] = _COMPLEX_PRECISION_ZERO; eta[ 1] = _COMPLEX_PRECISION_ZERO; @@ -1276,7 +1515,7 @@ eta[11] += conj_PRECISION(clover[41])*phi[10]; } - static inline void site_clover_PRECISION( const vector_PRECISION eta, const vector_PRECISION phi, const config_PRECISION clover ) { + static inline void site_clover_PRECISION( const buffer_PRECISION eta, const buffer_PRECISION phi, const config_PRECISION clover ) { // diagonal eta[ 0] = clover[ 0]*phi[ 0]; eta[ 1] = clover[ 1]*phi[ 1]; @@ -1353,5 +1592,86 @@ eta[11] += conj_PRECISION(clover[40])*phi[ 9]; eta[11] += conj_PRECISION(clover[41])*phi[10]; } - + + + + static inline void site_clover_PRECISION_new( const buffer_PRECISION eta, const buffer_PRECISION phi, const config_PRECISION clover ) { + int n_vect = g.num_rhs_vect, j, jj; + VECTOR_LOOP(j, n_vect, jj, // diagonal + eta[ 0*n_vect+j+jj] = clover[ 0]*phi[ 0*n_vect+j+jj]; + eta[ 1*n_vect+j+jj] = clover[ 1]*phi[ 1*n_vect+j+jj]; + eta[ 2*n_vect+j+jj] = clover[ 2]*phi[ 2*n_vect+j+jj]; + eta[ 3*n_vect+j+jj] = clover[ 3]*phi[ 3*n_vect+j+jj]; + eta[ 4*n_vect+j+jj] = clover[ 4]*phi[ 4*n_vect+j+jj]; + eta[ 5*n_vect+j+jj] = clover[ 5]*phi[ 5*n_vect+j+jj]; + eta[ 6*n_vect+j+jj] = clover[ 6]*phi[ 6*n_vect+j+jj]; + eta[ 7*n_vect+j+jj] = clover[ 7]*phi[ 7*n_vect+j+jj]; + eta[ 8*n_vect+j+jj] = clover[ 8]*phi[ 8*n_vect+j+jj]; + eta[ 9*n_vect+j+jj] = clover[ 9]*phi[ 9*n_vect+j+jj]; + eta[10*n_vect+j+jj] = clover[10]*phi[10*n_vect+j+jj]; + eta[11*n_vect+j+jj] = clover[11]*phi[11*n_vect+j+jj]; + // spin 0 and 1, row major + eta[0*n_vect+j+jj] += clover[12]*phi[1*n_vect+j+jj]; + eta[0*n_vect+j+jj] += clover[13]*phi[2*n_vect+j+jj]; + eta[0*n_vect+j+jj] += clover[14]*phi[3*n_vect+j+jj]; + eta[0*n_vect+j+jj] += 
clover[15]*phi[4*n_vect+j+jj]; + eta[0*n_vect+j+jj] += clover[16]*phi[5*n_vect+j+jj]; + eta[1*n_vect+j+jj] += clover[17]*phi[2*n_vect+j+jj]; + eta[1*n_vect+j+jj] += clover[18]*phi[3*n_vect+j+jj]; + eta[1*n_vect+j+jj] += clover[19]*phi[4*n_vect+j+jj]; + eta[1*n_vect+j+jj] += clover[20]*phi[5*n_vect+j+jj]; + eta[2*n_vect+j+jj] += clover[21]*phi[3*n_vect+j+jj]; + eta[2*n_vect+j+jj] += clover[22]*phi[4*n_vect+j+jj]; + eta[2*n_vect+j+jj] += clover[23]*phi[5*n_vect+j+jj]; + eta[3*n_vect+j+jj] += clover[24]*phi[4*n_vect+j+jj]; + eta[3*n_vect+j+jj] += clover[25]*phi[5*n_vect+j+jj]; + eta[4*n_vect+j+jj] += clover[26]*phi[5*n_vect+j+jj]; + eta[1*n_vect+j+jj] += conj_PRECISION(clover[12])*phi[0*n_vect+j+jj]; + eta[2*n_vect+j+jj] += conj_PRECISION(clover[13])*phi[0*n_vect+j+jj]; + eta[3*n_vect+j+jj] += conj_PRECISION(clover[14])*phi[0*n_vect+j+jj]; + eta[4*n_vect+j+jj] += conj_PRECISION(clover[15])*phi[0*n_vect+j+jj]; + eta[5*n_vect+j+jj] += conj_PRECISION(clover[16])*phi[0*n_vect+j+jj]; + eta[2*n_vect+j+jj] += conj_PRECISION(clover[17])*phi[1*n_vect+j+jj]; + eta[3*n_vect+j+jj] += conj_PRECISION(clover[18])*phi[1*n_vect+j+jj]; + eta[4*n_vect+j+jj] += conj_PRECISION(clover[19])*phi[1*n_vect+j+jj]; + eta[5*n_vect+j+jj] += conj_PRECISION(clover[20])*phi[1*n_vect+j+jj]; + eta[3*n_vect+j+jj] += conj_PRECISION(clover[21])*phi[2*n_vect+j+jj]; + eta[4*n_vect+j+jj] += conj_PRECISION(clover[22])*phi[2*n_vect+j+jj]; + eta[5*n_vect+j+jj] += conj_PRECISION(clover[23])*phi[2*n_vect+j+jj]; + eta[4*n_vect+j+jj] += conj_PRECISION(clover[24])*phi[3*n_vect+j+jj]; + eta[5*n_vect+j+jj] += conj_PRECISION(clover[25])*phi[3*n_vect+j+jj]; + eta[5*n_vect+j+jj] += conj_PRECISION(clover[26])*phi[4*n_vect+j+jj]; + // spin 2 and 3, row major + eta[ 6*n_vect+j+jj] += clover[27]*phi[ 7*n_vect+j+jj]; + eta[ 6*n_vect+j+jj] += clover[28]*phi[ 8*n_vect+j+jj]; + eta[ 6*n_vect+j+jj] += clover[29]*phi[ 9*n_vect+j+jj]; + eta[ 6*n_vect+j+jj] += clover[30]*phi[10*n_vect+j+jj]; + eta[ 6*n_vect+j+jj] += clover[31]*phi[11*n_vect+j+jj]; + eta[ 7*n_vect+j+jj] += clover[32]*phi[ 8*n_vect+j+jj]; + eta[ 7*n_vect+j+jj] += clover[33]*phi[ 9*n_vect+j+jj]; + eta[ 7*n_vect+j+jj] += clover[34]*phi[10*n_vect+j+jj]; + eta[ 7*n_vect+j+jj] += clover[35]*phi[11*n_vect+j+jj]; + eta[ 8*n_vect+j+jj] += clover[36]*phi[ 9*n_vect+j+jj]; + eta[ 8*n_vect+j+jj] += clover[37]*phi[10*n_vect+j+jj]; + eta[ 8*n_vect+j+jj] += clover[38]*phi[11*n_vect+j+jj]; + eta[ 9*n_vect+j+jj] += clover[39]*phi[10*n_vect+j+jj]; + eta[ 9*n_vect+j+jj] += clover[40]*phi[11*n_vect+j+jj]; + eta[10*n_vect+j+jj] += clover[41]*phi[11*n_vect+j+jj]; + eta[ 7*n_vect+j+jj] += conj_PRECISION(clover[27])*phi[ 6*n_vect+j+jj]; + eta[ 8*n_vect+j+jj] += conj_PRECISION(clover[28])*phi[ 6*n_vect+j+jj]; + eta[ 9*n_vect+j+jj] += conj_PRECISION(clover[29])*phi[ 6*n_vect+j+jj]; + eta[10*n_vect+j+jj] += conj_PRECISION(clover[30])*phi[ 6*n_vect+j+jj]; + eta[11*n_vect+j+jj] += conj_PRECISION(clover[31])*phi[ 6*n_vect+j+jj]; + eta[ 8*n_vect+j+jj] += conj_PRECISION(clover[32])*phi[ 7*n_vect+j+jj]; + eta[ 9*n_vect+j+jj] += conj_PRECISION(clover[33])*phi[ 7*n_vect+j+jj]; + eta[10*n_vect+j+jj] += conj_PRECISION(clover[34])*phi[ 7*n_vect+j+jj]; + eta[11*n_vect+j+jj] += conj_PRECISION(clover[35])*phi[ 7*n_vect+j+jj]; + eta[ 9*n_vect+j+jj] += conj_PRECISION(clover[36])*phi[ 8*n_vect+j+jj]; + eta[10*n_vect+j+jj] += conj_PRECISION(clover[37])*phi[ 8*n_vect+j+jj]; + eta[11*n_vect+j+jj] += conj_PRECISION(clover[38])*phi[ 8*n_vect+j+jj]; + eta[10*n_vect+j+jj] += conj_PRECISION(clover[39])*phi[ 9*n_vect+j+jj]; + 
eta[11*n_vect+j+jj] += conj_PRECISION(clover[40])*phi[ 9*n_vect+j+jj]; + eta[11*n_vect+j+jj] += conj_PRECISION(clover[41])*phi[10*n_vect+j+jj];) + } + #endif diff --git a/src/gathering_generic.c b/src/gathering_generic.c index 2eb10fc..fbf0445 100644 --- a/src/gathering_generic.c +++ b/src/gathering_generic.c @@ -28,8 +28,8 @@ void gathering_PRECISION_next_level_init( gathering_PRECISION_struct *gs, level_ gs->permutation = NULL; gs->gather_list = NULL; gs->reqs = NULL; - gs->buffer = NULL; - gs->transfer_buffer = NULL; + vector_PRECISION_init(&(gs->buffer)); + vector_PRECISION_init(&(gs->transfer_buffer)); gs->dist_inner_lattice_sites = 1; gs->gather_list_length = 1; @@ -49,9 +49,9 @@ void gathering_PRECISION_setup( gathering_PRECISION_struct *gs, level_struct *l process_coords[4] = {0,0,0,0}, parent_coords[4] = {0,0,0,0}, *process_list = NULL; MALLOC( process_list, int, l->num_processes ); #ifdef HAVE_TM1p1 - MALLOC( gs->transfer_buffer, complex_PRECISION, 2 * gs->dist_inner_lattice_sites * l->num_lattice_site_var ); + MALLOC( gs->transfer_buffer.vector_buffer, complex_PRECISION, 2 * gs->dist_inner_lattice_sites * l->num_lattice_site_var ); #else - MALLOC( gs->transfer_buffer, complex_PRECISION, gs->dist_inner_lattice_sites * l->num_lattice_site_var ); + MALLOC( gs->transfer_buffer.vector_buffer, complex_PRECISION, gs->dist_inner_lattice_sites * l->num_lattice_site_var ); #endif l->idle = 0; @@ -96,9 +96,9 @@ void gathering_PRECISION_setup( gathering_PRECISION_struct *gs, level_struct *l MALLOC( gs->permutation, int, l->num_inner_lattice_sites ); MALLOC( gs->reqs, MPI_Request, gs->gather_list_length ); #ifdef HAVE_TM1p1 - MALLOC( gs->buffer, complex_PRECISION, 2*l->inner_vector_size ); + vector_PRECISION_alloc( &(gs->buffer), _INNER, 2, l, no_threading ); #else - MALLOC( gs->buffer, complex_PRECISION, l->inner_vector_size ); + vector_PRECISION_alloc( &(gs->buffer), _INNER, 1, l, no_threading ); #endif MALLOC( field1, int, l->num_inner_lattice_sites ); MALLOC( field2, int, l->num_inner_lattice_sites ); @@ -212,19 +212,15 @@ void gathering_PRECISION_free( gathering_PRECISION_struct *gs, level_struct *l ) FREE( gs->gather_list, int, gs->gather_list_length ); FREE( gs->permutation, int, l->num_inner_lattice_sites ); FREE( gs->reqs, MPI_Request, gs->gather_list_length ); -#ifdef HAVE_TM1p1 - FREE( gs->buffer, complex_PRECISION, 2*l->inner_vector_size ); -#else - FREE( gs->buffer, complex_PRECISION, l->inner_vector_size ); -#endif + vector_PRECISION_free( &(gs->buffer), l, no_threading ); } MPI_Comm_free( &(gs->level_comm) ); MPI_Group_free( &(gs->level_comm_group) ); #ifdef HAVE_TM1p1 - FREE( gs->transfer_buffer, complex_PRECISION, 2 * gs->dist_inner_lattice_sites * l->num_lattice_site_var ); + FREE( gs->transfer_buffer.vector_buffer, complex_PRECISION, 2 * gs->dist_inner_lattice_sites * l->num_lattice_site_var ); #else - FREE( gs->transfer_buffer, complex_PRECISION, gs->dist_inner_lattice_sites * l->num_lattice_site_var ); + FREE( gs->transfer_buffer.vector_buffer, complex_PRECISION, gs->dist_inner_lattice_sites * l->num_lattice_site_var ); #endif } @@ -270,17 +266,17 @@ void conf_PRECISION_gather( operator_PRECISION_struct *out, operator_PRECISION_s } else { int i, j, n=l->gs_PRECISION.gather_list_length, s=l->num_inner_lattice_sites, t, *pi = l->gs_PRECISION.permutation; - vector_PRECISION buffer_hopp = NULL, buffer_clov = NULL, buffer_odd_proj = NULL; + buffer_PRECISION buffer_hopp = NULL, buffer_clov = NULL, buffer_odd_proj = NULL; MPI_Request *hopp_reqs = NULL, *clov_reqs = NULL, 
*odd_proj_reqs = NULL; - + #ifdef HAVE_TM1p1 - vector_PRECISION buffer_eps_term = NULL; + buffer_PRECISION buffer_eps_term = NULL; MPI_Request *eps_term_reqs = NULL; MALLOC( buffer_eps_term, complex_PRECISION, n*send_size_block ); MALLOC( eps_term_reqs, MPI_Request, n ); #endif #ifdef HAVE_TM - vector_PRECISION buffer_tm_term = NULL; + buffer_PRECISION buffer_tm_term = NULL; MPI_Request *tm_term_reqs = NULL; MALLOC( buffer_tm_term, complex_PRECISION, n*send_size_block ); MALLOC( tm_term_reqs, MPI_Request, n ); @@ -408,12 +404,12 @@ void conf_PRECISION_gather( operator_PRECISION_struct *out, operator_PRECISION_s l->dummy_p_PRECISION.eval_operator = apply_coarse_operator_PRECISION; } -void vector_PRECISION_gather( vector_PRECISION gath, vector_PRECISION dist, level_struct *l ) { +void vector_PRECISION_gather( vector_PRECISION *gath, vector_PRECISION *dist, level_struct *l ) { int send_size = l->gs_PRECISION.dist_inner_lattice_sites * l->num_lattice_site_var; if ( g.my_rank != l->parent_rank ) { - MPI_Send( dist, send_size, MPI_COMPLEX_PRECISION, l->parent_rank, g.my_rank, g.comm_cart ); + MPI_Send( dist->vector_buffer, send_size, MPI_COMPLEX_PRECISION, l->parent_rank, g.my_rank, g.comm_cart ); } else { int i, j, n=l->gs_PRECISION.gather_list_length, s=l->num_inner_lattice_sites, t=l->num_lattice_site_var, *pi = l->gs_PRECISION.permutation; @@ -421,12 +417,12 @@ void vector_PRECISION_gather( vector_PRECISION gath, vector_PRECISION dist, leve PROF_PRECISION_START( _GD_COMM ); for ( i=1; igs_PRECISION.gather_list[i], + MPI_Irecv( buffer.vector_buffer+i*send_size, send_size, MPI_COMPLEX_PRECISION, l->gs_PRECISION.gather_list[i], l->gs_PRECISION.gather_list[i], g.comm_cart, &(l->gs_PRECISION.reqs[i]) ); PROF_PRECISION_STOP( _GD_COMM, n-1 ); for ( i=0; ivector_buffer[i]; PROF_PRECISION_START( _GD_IDLE ); for ( i=1; ivector_buffer[ t*pi[i] + j ] = buffer.vector_buffer[ t*i + j ]; } } -void vector_PRECISION_distribute( vector_PRECISION dist, vector_PRECISION gath, level_struct *l ) { +void vector_PRECISION_distribute( vector_PRECISION *dist, vector_PRECISION *gath, level_struct *l ) { int send_size = l->gs_PRECISION.dist_inner_lattice_sites * l->num_lattice_site_var; if ( g.my_rank != l->parent_rank ) { - MPI_Recv( dist, send_size, MPI_COMPLEX_PRECISION, l->parent_rank, g.my_rank, g.comm_cart, MPI_STATUS_IGNORE ); + MPI_Recv( dist->vector_buffer, send_size, MPI_COMPLEX_PRECISION, l->parent_rank, g.my_rank, g.comm_cart, MPI_STATUS_IGNORE ); } else { int i, j, n=l->gs_PRECISION.gather_list_length, s=l->num_inner_lattice_sites, t=l->num_lattice_site_var, *pi = l->gs_PRECISION.permutation; @@ -453,16 +449,16 @@ void vector_PRECISION_distribute( vector_PRECISION dist, vector_PRECISION gath, // permute data according to desired distributed data layout for ( i=0; ivector_buffer[ t*pi[i]+j ]; PROF_PRECISION_START( _GD_COMM ); for ( i=1; igs_PRECISION.gather_list[i], + MPI_Isend( buffer.vector_buffer+i*send_size, send_size, MPI_COMPLEX_PRECISION, l->gs_PRECISION.gather_list[i], l->gs_PRECISION.gather_list[i], g.comm_cart, &(l->gs_PRECISION.reqs[i]) ); PROF_PRECISION_STOP( _GD_COMM, n-1 ); for ( i=0; ivector_buffer[i] = buffer.vector_buffer[i]; PROF_PRECISION_START( _GD_IDLE ); for ( i=1; iglobal_splitting[mu] > 1 ) { @@ -34,49 +34,18 @@ void negative_sendrecv_PRECISION( vector_PRECISION phi, const int mu, comm_PRECI for ( i=0; inum_boundary_sites[2*i]; - buffer = l->vbuf_PRECISION[8]+n*(boundary_start-l->num_inner_lattice_sites); - buffer_pt = buffer; + buffer.vector_buffer = 
l->vbuf_PRECISION[8].vector_buffer+n*(boundary_start-l->num_inner_lattice_sites); + buffer_pt.vector_buffer = buffer.vector_buffer; for ( i=0; ivector_buffer + n*boundary_table[i]; + for ( j=0; jvector_buffer+n*boundary_start, n*num_boundary_sites, MPI_COMPLEX_PRECISION, l->neighbor_rank[2*mu], 2*mu+1, g.comm_cart, &(c->rreqs[2*mu+1]) ); - MPI_Isend( buffer, n*num_boundary_sites, MPI_COMPLEX_PRECISION, - l->neighbor_rank[2*mu+1], 2*mu+1, g.comm_cart, &(c->sreqs[2*mu+1]) ); - } -} - - -void negative_sendrecv_PRECISION_vectorized( complex_PRECISION *phi, const int mu, comm_PRECISION_struct *c, - level_struct *l, int count, complex_PRECISION *buffer ) { - // send dir = -1 - if( l->global_splitting[mu] > 1 ) { - - int i, j, num_boundary_sites = c->num_boundary_sites[2*mu+1], boundary_start, - *boundary_table = c->boundary_table[2*mu+1], n = l->num_lattice_site_var; - - complex_PRECISION *tmp_pt; - complex_PRECISION *buffer_pt; - - boundary_start = l->num_inner_lattice_sites; - for ( i=0; inum_boundary_sites[2*i]; - - buffer_pt = buffer; - - for ( i=0; ineighbor_rank[2*mu], 2*mu+1, g.comm_cart, &(c->rreqs[2*mu+1]) ); - MPI_Isend( buffer, count*n*num_boundary_sites, MPI_COMPLEX_PRECISION, + MPI_Isend( buffer.vector_buffer, n*num_boundary_sites, MPI_COMPLEX_PRECISION, l->neighbor_rank[2*mu+1], 2*mu+1, g.comm_cart, &(c->sreqs[2*mu+1]) ); } } @@ -89,8 +58,6 @@ void negative_wait_PRECISION( const int mu, comm_PRECISION_struct *c, level_stru MPI_Wait( &(c->rreqs[2*mu+1]), MPI_STATUS_IGNORE ); } } - - void ghost_alloc_PRECISION( int buffer_size, comm_PRECISION_struct *c, level_struct *l ) { int mu, nu, factor=1; @@ -141,12 +108,13 @@ void ghost_alloc_PRECISION( int buffer_size, comm_PRECISION_struct *c, level_str #endif } } - - if ( l->vbuf_PRECISION[8] == NULL ) { + if ( l->vbuf_PRECISION[8].vector_buffer == NULL ) { #ifdef HAVE_TM1p1 - MALLOC( l->vbuf_PRECISION[8], complex_PRECISION, 2*l->vector_size ); + //vector_PRECISION_alloc( &(l->vbuf_PRECISION[8]), _ORDINARY, 2, l, no_threading); + MALLOC( l->vbuf_PRECISION[8].vector_buffer, complex_PRECISION, 2*l->vector_size ); #else - MALLOC( l->vbuf_PRECISION[8], complex_PRECISION, l->vector_size ); + //vector_PRECISION_alloc( &(l->vbuf_PRECISION[8]), _ORDINARY, 1, l, no_threading); + MALLOC( l->vbuf_PRECISION[8].vector_buffer, complex_PRECISION, l->vector_size ); #endif } } @@ -160,14 +128,14 @@ void ghost_free_PRECISION( comm_PRECISION_struct *c, level_struct *l ) { FREE( c->buffer[2*mu], complex_PRECISION, c->max_length[mu] ); FREE( c->buffer[2*mu+1], complex_PRECISION, c->max_length[mu] ); } - - if ( l->vbuf_PRECISION[8] != NULL ) { -#ifdef HAVE_TM1p1 - FREE( l->vbuf_PRECISION[8], complex_PRECISION, 2*l->vector_size ); -#else - FREE( l->vbuf_PRECISION[8], complex_PRECISION, l->vector_size ); -#endif - } + if ( l->vbuf_PRECISION[8].vector_buffer != NULL ){ + // vector_PRECISION_free( &(l->vbuf_PRECISION[8]), l, no_threading); +#ifdef HAVE_TM1p1 + FREE( l->vbuf_PRECISION[8].vector_buffer, complex_PRECISION, 2*l->vector_size ); + #else + FREE( l->vbuf_PRECISION[8].vector_buffer, complex_PRECISION, l->vector_size ); + #endif + } } @@ -185,14 +153,14 @@ void ghost_sendrecv_init_PRECISION( const int type, comm_PRECISION_struct *c, le } -void ghost_sendrecv_PRECISION( vector_PRECISION phi, const int mu, const int dir, +void ghost_sendrecv_PRECISION( buffer_PRECISION phi, const int mu, const int dir, comm_PRECISION_struct *c, const int amount, level_struct *l ) { // does not allow sending in both directions at the same time if( l->global_splitting[mu] > 1 ) { 
int i, j, *table=NULL, mu_dir = 2*mu-MIN(dir,0), offset = c->offset, length[2] = {0,0}, comm_start = 0, table_start = 0; - vector_PRECISION buffer, phi_pt; + buffer_PRECISION buffer, phi_pt; if ( amount == _FULL_SYSTEM ) { length[0] = (c->num_boundary_sites[2*mu])*offset; @@ -229,7 +197,7 @@ void ghost_sendrecv_PRECISION( vector_PRECISION phi, const int mu, const int dir ghost_alloc_PRECISION( MAX(length[0],length[1]), c, l ); } - buffer = (vector_PRECISION)c->buffer[mu_dir]; + buffer = c->buffer[mu_dir]; // dir = senddir if ( dir == 1 ) { @@ -268,7 +236,7 @@ void ghost_sendrecv_PRECISION( vector_PRECISION phi, const int mu, const int dir buffer += offset; } - buffer = (vector_PRECISION)c->buffer[mu_dir]; + buffer = c->buffer[mu_dir]; phi_pt = phi + comm_start; if ( length[0] > 0 ) { @@ -289,13 +257,13 @@ void ghost_sendrecv_PRECISION( vector_PRECISION phi, const int mu, const int dir } -void ghost_wait_PRECISION( vector_PRECISION phi, const int mu, const int dir, +void ghost_wait_PRECISION( buffer_PRECISION phi, const int mu, const int dir, comm_PRECISION_struct *c, const int amount, level_struct *l ) { if( l->global_splitting[mu] > 1 ) { int mu_dir = 2*mu-MIN(dir,0); int i, j, *table, offset = c->offset, length[2]={0,0}, table_start = 0; - vector_PRECISION buffer, phi_pt; + buffer_PRECISION buffer, phi_pt; #ifdef HAVE_TM1p1 if ( g.n_flavours == 2 ) @@ -322,7 +290,7 @@ void ghost_wait_PRECISION( vector_PRECISION phi, const int mu, const int dir, int num_boundary_sites = length[0]/offset; - buffer = (vector_PRECISION)c->buffer[mu_dir]; + buffer = c->buffer[mu_dir]; table = c->boundary_table[2*mu+1] + table_start; if ( length[0] > 0 ) { @@ -375,17 +343,17 @@ void ghost_wait_PRECISION( vector_PRECISION phi, const int mu, const int dir, } -void ghost_update_PRECISION( vector_PRECISION phi, const int mu, const int dir, comm_PRECISION_struct *c, level_struct *l ) { +void ghost_update_PRECISION( vector_PRECISION *phi, const int mu, const int dir, comm_PRECISION_struct *c, level_struct *l ) { if( l->global_splitting[mu] > 1 ) { int i, j, mu_dir = 2*mu-MIN(dir,0), nu, inv_mu_dir = 2*mu+1+MIN(dir,0), length, *table=NULL, comm_start, num_boundary_sites, site_var; - vector_PRECISION buffer, recv_pt, phi_pt; + buffer_PRECISION buffer, recv_pt, phi_pt; site_var = l->num_lattice_site_var; length = c->num_boundary_sites[mu_dir]*l->num_lattice_site_var; num_boundary_sites = c->num_boundary_sites[mu_dir]; - buffer = (vector_PRECISION)c->buffer[mu_dir]; + buffer = c->buffer[mu_dir]; if ( dir == -1 ) comm_start = l->vector_size; @@ -398,7 +366,7 @@ void ghost_update_PRECISION( vector_PRECISION phi, const int mu, const int dir, ASSERT( c->in_use[mu_dir] == 0 ); c->in_use[mu_dir] = 1; - recv_pt = phi + comm_start; + recv_pt = phi->vector_buffer + comm_start; if ( length > 0 ) { PROF_PRECISION_START( _OP_COMM ); MPI_Irecv( recv_pt, length, MPI_COMPLEX_PRECISION, @@ -408,14 +376,14 @@ void ghost_update_PRECISION( vector_PRECISION phi, const int mu, const int dir, table = c->boundary_table[inv_mu_dir]; for ( j=0; jvector_buffer + table[j]*site_var; for ( i=0; ibuffer[mu_dir]; + buffer = c->buffer[mu_dir]; if ( length > 0 ) { PROF_PRECISION_START( _OP_COMM ); @@ -427,7 +395,7 @@ void ghost_update_PRECISION( vector_PRECISION phi, const int mu, const int dir, } -void ghost_update_wait_PRECISION( vector_PRECISION phi, const int mu, const int dir, comm_PRECISION_struct *c, level_struct *l ) { +void ghost_update_wait_PRECISION( vector_PRECISION *phi, const int mu, const int dir, comm_PRECISION_struct *c, level_struct 
*l ) { if( l->global_splitting[mu] > 1 ) { int mu_dir = 2*mu-MIN(dir,0), length = c->num_boundary_sites[mu_dir]*l->num_lattice_site_var; diff --git a/src/ghost_generic.h b/src/ghost_generic.h index 7b5b019..59a583c 100644 --- a/src/ghost_generic.h +++ b/src/ghost_generic.h @@ -22,22 +22,18 @@ #ifndef GHOST_PRECISION_HEADER #define GHOST_PRECISION_HEADER - void negative_sendrecv_PRECISION( vector_PRECISION phi, const int mu, comm_PRECISION_struct *c, level_struct *l ); - - // as negative_sendrecv_PRECISION, but for count vectors stored in phi in vector-fused data layout - // buffer must be big enough to hold the surface data for count vectors (in one direction) - void negative_sendrecv_PRECISION_vectorized( complex_PRECISION *phi, const int mu, comm_PRECISION_struct *c, level_struct *l, int count, complex_PRECISION *buffer ); + void negative_sendrecv_PRECISION( vector_PRECISION *phi, const int mu, comm_PRECISION_struct *c, level_struct *l ); void negative_wait_PRECISION( const int mu, comm_PRECISION_struct *c, level_struct *l ); void ghost_alloc_PRECISION( int buffer_size, comm_PRECISION_struct *c, level_struct *l ); void ghost_free_PRECISION( comm_PRECISION_struct *c, level_struct *l ); void ghost_sendrecv_init_PRECISION( const int type, comm_PRECISION_struct *c, level_struct *l ); - void ghost_sendrecv_PRECISION( vector_PRECISION phi, const int mu, const int dir, + void ghost_sendrecv_PRECISION( buffer_PRECISION phi, const int mu, const int dir, comm_PRECISION_struct *c, const int amount, level_struct *l ); - void ghost_wait_PRECISION( vector_PRECISION phi, const int mu, const int dir, + void ghost_wait_PRECISION( buffer_PRECISION phi, const int mu, const int dir, comm_PRECISION_struct *c, const int amount, level_struct *l ); - void ghost_update_PRECISION( vector_PRECISION phi, const int mu, const int dir, comm_PRECISION_struct *c, level_struct *l ); - void ghost_update_wait_PRECISION( vector_PRECISION phi, const int mu, const int dir, comm_PRECISION_struct *c, level_struct *l ); + void ghost_update_PRECISION( vector_PRECISION *phi, const int mu, const int dir, comm_PRECISION_struct *c, level_struct *l ); + void ghost_update_wait_PRECISION( vector_PRECISION *phi, const int mu, const int dir, comm_PRECISION_struct *c, level_struct *l ); #endif diff --git a/src/init.c b/src/init.c index cd83ce4..614c515 100644 --- a/src/init.c +++ b/src/init.c @@ -152,22 +152,26 @@ void method_setup( vector_double *V, level_struct *l, struct Thread *threading ) #ifdef INIT_ONE_PREC if ( g.mixed_precision == 2 ) { #endif - fgmres_MP_struct_alloc( g.restart, g.max_restart, l->inner_vector_size, + fgmres_MP_struct_alloc( g.restart, g.max_restart, _INNER, g.tol, _RIGHT, vcycle_float, &(g.p_MP), l ); g.p.op = &(g.op_double); #if defined(INIT_ONE_PREC) && (defined (DEBUG) || defined (TEST_VECTOR_ANALYSIS)) #ifdef HAVE_TM1p1 - MALLOC( g.p.b, complex_double, 2*l->inner_vector_size ); - MALLOC( g.p.x, complex_double, 2*l->inner_vector_size ); + vector_double_alloc( &(g.p.b), _INNER, 2*g.num_rhs_vect, l, no_threading ); + vector_double_alloc( &(g.p.x), _INNER, 2*g.num_rhs_vect, l, no_threading ); + //MALLOC( g.p.b, complex_double, 2*l->inner_vector_size ); + //MALLOC( g.p.x, complex_double, 2*l->inner_vector_size ); #else - MALLOC( g.p.b, complex_double, l->inner_vector_size ); - MALLOC( g.p.x, complex_double, l->inner_vector_size ); + vector_double_alloc( &(g.p.b), _INNER, g.num_rhs_vect, l, no_threading ); + vector_double_alloc( &(g.p.x), _INNER, g.num_rhs_vect, l, no_threading ); + //MALLOC( g.p.b, complex_double, 
l->inner_vector_size ); + //MALLOC( g.p.x, complex_double, l->inner_vector_size ); #endif #endif #ifdef INIT_ONE_PREC } else { #endif - fgmres_double_struct_alloc( g.restart, g.max_restart, l->inner_vector_size, g.tol, + fgmres_double_struct_alloc( g.restart, g.max_restart, _INNER, g.tol, _GLOBAL_FGMRES, _RIGHT, preconditioner, g.method==6?g5D_plus_clover_double:d_plus_clover_double, &(g.p), l ); } @@ -178,29 +182,36 @@ void method_setup( vector_double *V, level_struct *l, struct Thread *threading ) #ifdef INIT_ONE_PREC if ( g.mixed_precision == 2 ) { #endif - fgmres_MP_struct_alloc( g.restart, g.max_restart, l->inner_vector_size, + fgmres_MP_struct_alloc( g.restart, g.max_restart, _INNER, g.tol, _NOTHING, NULL, &(g.p_MP), l ); g.p.op = &(g.op_double); #if defined(INIT_ONE_PREC) && (defined (DEBUG) || defined (TEST_VECTOR_ANALYSIS)) #ifdef HAVE_TM1p1 - MALLOC( g.p.b, complex_double, 2*l->inner_vector_size ); - MALLOC( g.p.x, complex_double, 2*l->inner_vector_size ); + vector_double_alloc( &(g.p.b), _INNER, 2*g.num_rhs_vect, l, no_threading ); + vector_double_alloc( &(g.p.x), _INNER, 2*g.num_rhs_vect, l, no_threading ); + //MALLOC( g.p.b, complex_double, 2*l->inner_vector_size ); + //MALLOC( g.p.x, complex_double, 2*l->inner_vector_size ); #else - MALLOC( g.p.b, complex_double, l->inner_vector_size ); - MALLOC( g.p.x, complex_double, l->inner_vector_size ); + vector_double_alloc( &(g.p.b), _INNER, g.num_rhs_vect, l, no_threading ); + vector_double_alloc( &(g.p.x), _INNER, g.num_rhs_vect, l, no_threading ); + //MALLOC( g.p.b, complex_double, l->inner_vector_size ); + //MALLOC( g.p.x, complex_double, l->inner_vector_size ); #endif #endif #ifdef INIT_ONE_PREC } else { #endif - fgmres_double_struct_alloc( g.restart, g.max_restart, l->inner_vector_size, g.tol, + /*fgmres_double_struct_alloc( g.restart, g.max_restart, _INNER, g.tol, _GLOBAL_FGMRES, _NOTHING, NULL, d_plus_clover_double, + &(g.p), l );*/ + fgmres_double_struct_alloc( g.restart, g.max_restart, _INNER, g.tol, + _GLOBAL_FGMRES, _NOTHING, NULL, d_plus_clover_double_new, &(g.p), l ); #ifdef INIT_ONE_PREC } #endif } else if ( g.method == -1 ) { - fgmres_double_struct_alloc( 4, g.restart*g.max_restart, l->inner_vector_size, g.tol, + fgmres_double_struct_alloc( 4, g.restart*g.max_restart, _INNER, g.tol, _GLOBAL_FGMRES, _NOTHING, NULL, d_plus_clover_double, &(g.p), l ); fine_level_double_alloc( l ); } @@ -361,14 +372,18 @@ void method_free( level_struct *l ) { #ifdef INIT_ONE_PREC if ( g.mixed_precision == 2 && g.method >= 0 ) { #endif - fgmres_MP_struct_free( &(g.p_MP) ); + fgmres_MP_struct_free( &(g.p_MP), l ); #if defined (INIT_ONE_PREC) && (defined (DEBUG) || defined (TEST_VECTOR_ANALYSIS)) #ifdef HAVE_TM1p1 - FREE( g.p.b, complex_double, 2*l->inner_vector_size ); - FREE( g.p.x, complex_double, 2*l->inner_vector_size ); + vector_double_free( &(g.p.b), l, no_threading ); + vector_double_free( &(g.p.x), l, no_threading ); + //FREE( g.p.b, complex_double, 2*l->inner_vector_size ); + //FREE( g.p.x, complex_double, 2*l->inner_vector_size ); #else - FREE( g.p.b, complex_double, l->inner_vector_size ); - FREE( g.p.x, complex_double, l->inner_vector_size ); + vector_double_free( &(g.p.b), l, no_threading ); + vector_double_free( &(g.p.x), l, no_threading ); + //FREE( g.p.b, complex_double, l->inner_vector_size ); + //FREE( g.p.x, complex_double, l->inner_vector_size ); #endif #endif #ifdef INIT_ONE_PREC @@ -646,8 +661,8 @@ void l_init( level_struct *l ) { level_double_init( l ); level_float_init( l ); - - l->x = NULL; + + 
vector_double_init(&(l->x)); l->next_level = NULL; l->reqs = NULL; } @@ -679,6 +694,7 @@ void g_init( level_struct *l ) { g.cur_storage = 0; g.max_storage = 0; g.in_setup = 0; + g.num_rhs_vect = 0; } void read_global_info( FILE *in ) { @@ -1021,6 +1037,8 @@ void read_solver_parameters( FILE *in, level_struct *l ) { save_pt = &(g.downprop); g.downprop=1; read_parameter( &save_pt, "addDownPropagator:", "%d", 1, in, _DEFAULT_SET ); #endif + save_pt = &(g.num_rhs_vect); g.num_rhs_vect=1; + read_parameter( &save_pt, "number of rhs vectors:", "%d", 1, in, _DEFAULT_SET ); if ( g.randomize ) { srand( time( 0 ) + 1000*g.my_rank ); @@ -1085,13 +1103,6 @@ void validate_parameters( int ls, level_struct *l ) { int i; int mu; -#ifdef SSE - if ( !g.odd_even ) - warning0("The SSE implementation is based on the odd-even preconditioned code.\ - \n Switch on odd-even preconditioning in the input file.\n"); - ASSERT( g.odd_even ); -#endif - if ( g.method == 5 && g.interpolation != 0 ) { warning0("Multigrid with BiCGstab smoothing is not supported.\n Switching to FGMRES preconditioned with BiCGstab (g.interpolation=0).\n"); g.interpolation = 0; @@ -1115,14 +1126,6 @@ void validate_parameters( int ls, level_struct *l ) { ASSERT( DIVIDES( g.block_lattice[i][mu], g.local_lattice[i][mu] ) ); ASSERT( DIVIDES( g.global_lattice[i][mu]/g.global_lattice[i+1][mu], g.local_lattice[i][mu] ) ); ASSERT( DIVIDES( g.block_lattice[i][mu], g.global_lattice[i][mu]/g.global_lattice[i+1][mu] ) ); -#ifdef SSE - if ( g.block_lattice[i][mu] != g.global_lattice[i][mu]/g.global_lattice[i+1][mu] ) - warning0("when using SSE, Schwarz block size and aggregate size have to match.\n"); - ASSERT( g.block_lattice[i][mu] == g.global_lattice[i][mu]/g.global_lattice[i+1][mu] ); - // it works everywhere but we have some problem with the vector size. - // TODO: check all vectora allocated with size l->inner_vector_size - ASSERT( g.num_eig_vect[i] % SIMD_LENGTH_float == 0 ); -#endif } if ( g.odd_even ) { @@ -1161,10 +1164,6 @@ void validate_parameters( int ls, level_struct *l ) { //LIST OF CASES WHICH SHOULD WORK, BUT DO NOT (TODO) -#ifdef SSE - ASSERT( g.mixed_precision ); -#endif - //TODO: Could work without, but you need to fix the setup phase. 
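Note (not part of the patch): the new "number of rhs vectors:" input parameter read above sets g.num_rhs_vect, and the *_new kernels earlier in this diff interleave the right-hand sides so that spin-color component c of rhs r is addressed as c*n_vect + r. The following is a minimal plain-C sketch of that indexing convention only; the function and variable names are hypothetical stand-ins, and in the real kernels the VECTOR_LOOP macro plays the role of the inner loop over r. It mirrors just the clover-diagonal part of site_clover_PRECISION_new.

  #include <complex.h>

  /* Illustration of the interleaved multi-rhs layout used by the *_new kernels:
   * component c (0..11) of right-hand side r lives at index c*n_vect + r. */
  static void site_clover_diag_sketch( float complex *eta, const float complex *phi,
                                       const float complex *clover, int n_vect ) {
    for ( int c = 0; c < 12; c++ )        /* 12 spin-color components per site */
      for ( int r = 0; r < n_vect; r++ )  /* all interleaved right-hand sides  */
        eta[c*n_vect + r] = clover[c] * phi[c*n_vect + r];  /* diagonal term only */
  }

With this ordering all rhs copies of a given component are contiguous, so the innermost loop runs over contiguous memory.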
for ( i=0; iprof_PRECISION.name[_GRAM_SCHMIDT], "Gram-Schmidt, PRECISION" ); sprintf( l->prof_PRECISION.name[_GRAM_SCHMIDT_ON_AGGREGATES], "Gram-Schmidt on aggregates, PRECISION" ); sprintf( l->prof_PRECISION.name[_CPY], "copy operations, PRECISION" ); + sprintf( l->prof_PRECISION.name[_RS], "real scale operations, PRECISION" ); sprintf( l->prof_PRECISION.name[_SET], "set value operations, PRECISION" ); sprintf( l->prof_PRECISION.name[_PR], "interpolation and restriction, PRECISION" ); l->prof_PRECISION.flop[_PR] = level_ratio*l->num_lattice_site_var*8.0*(l->num_lattice_site_var/2); @@ -94,22 +95,21 @@ double prof_PRECISION_print( level_struct *l ) { return flop; } - void fine_level_PRECISION_alloc( level_struct *l ) { int n = 8; #ifdef HAVE_TM1p1 - MALLOC( l->vbuf_PRECISION[0], complex_PRECISION, 2*n*l->vector_size ); - for ( int i=1; ivbuf_PRECISION[i] = l->vbuf_PRECISION[0] + 2*i*l->vector_size; - MALLOC( l->p_PRECISION.b, complex_PRECISION, 2*2*l->inner_vector_size ); - l->p_PRECISION.x = l->p_PRECISION.b + 2*l->inner_vector_size; + for ( int i=0; ivbuf_PRECISION[i]), _ORDINARY, 2*g.num_rhs_vect, l, no_threading ); + } + vector_PRECISION_alloc( &(l->p_PRECISION.b), _INNER, 2*g.num_rhs_vect, l, no_threading ); + vector_PRECISION_alloc( &(l->p_PRECISION.x), _INNER, 2*g.num_rhs_vect, l, no_threading ); #else - MALLOC( l->vbuf_PRECISION[0], complex_PRECISION, n*l->vector_size ); - for ( int i=1; ivbuf_PRECISION[i] = l->vbuf_PRECISION[0] + i*l->vector_size; - MALLOC( l->p_PRECISION.b, complex_PRECISION, 2*l->inner_vector_size ); - l->p_PRECISION.x = l->p_PRECISION.b + l->inner_vector_size; + for ( int i=0; ivbuf_PRECISION[i]), _ORDINARY, g.num_rhs_vect, l, no_threading ); + } + vector_PRECISION_alloc( &(l->p_PRECISION.b), _INNER, g.num_rhs_vect, l, no_threading ); + vector_PRECISION_alloc( &(l->p_PRECISION.x), _INNER, g.num_rhs_vect, l, no_threading ); #endif } @@ -117,20 +117,10 @@ void fine_level_PRECISION_alloc( level_struct *l ) { void fine_level_PRECISION_free( level_struct *l ) { int n = 8; - -#ifdef HAVE_TM1p1 - FREE( l->vbuf_PRECISION[0], complex_PRECISION, 2*n*l->vector_size ); - for ( int i=1; ivbuf_PRECISION[i] = NULL; - FREE( l->p_PRECISION.b, complex_PRECISION, 2*2*l->inner_vector_size ); - l->p_PRECISION.x = NULL; -#else - FREE( l->vbuf_PRECISION[0], complex_PRECISION, n*l->vector_size ); - for ( int i=1; ivbuf_PRECISION[i] = NULL; - FREE( l->p_PRECISION.b, complex_PRECISION, 2*l->inner_vector_size ); - l->p_PRECISION.x = NULL; -#endif + for ( int i=0; ivbuf_PRECISION[i]), l, no_threading ); + vector_PRECISION_free( &(l->p_PRECISION.b), l, no_threading ); + vector_PRECISION_free( &(l->p_PRECISION.x), l, no_threading ); } @@ -146,24 +136,26 @@ void next_level_PRECISION_setup( level_struct *l ) { coarsening_index_table_PRECISION_define( &(l->is_PRECISION), &(l->s_PRECISION), l ); if ( l->level == 1 && !l->next_level->idle ) { - fgmres_PRECISION_struct_alloc( g.coarse_iter, g.coarse_restart, l->next_level->vector_size, g.coarse_tol, + fgmres_PRECISION_struct_alloc( g.coarse_iter, g.coarse_restart, _ORDINARY, g.coarse_tol, _COARSE_GMRES, _NOTHING, NULL, g.method==6?(g.odd_even?g5D_coarse_apply_schur_complement_PRECISION:g5D_apply_coarse_operator_PRECISION) :(g.odd_even?coarse_apply_schur_complement_PRECISION:apply_coarse_operator_PRECISION), &(l->next_level->p_PRECISION), l->next_level ); } else { if ( g.kcycle ) { - fgmres_PRECISION_struct_alloc( g.kcycle_restart, g.kcycle_max_restart, l->next_level->vector_size, g.kcycle_tol, + fgmres_PRECISION_struct_alloc( g.kcycle_restart, 
g.kcycle_max_restart, _ORDINARY, g.kcycle_tol, _K_CYCLE, _RIGHT, vcycle_PRECISION, g.method==6?g5D_apply_coarse_operator_PRECISION:apply_coarse_operator_PRECISION, &(l->next_level->p_PRECISION), l->next_level ); } else { + vector_PRECISION_init(&(l->next_level->p_PRECISION.b)); + vector_PRECISION_init(&(l->next_level->p_PRECISION.x)); #ifdef HAVE_TM1p1 - MALLOC( l->next_level->p_PRECISION.b, complex_PRECISION, 2*2*l->next_level->vector_size ); - l->next_level->p_PRECISION.x = l->next_level->p_PRECISION.b + 2*l->next_level->vector_size; + vector_PRECISION_alloc( &(l->next_level->p_PRECISION.b), _ORDINARY, 2*g.num_rhs_vect, l->next_level, no_threading ); + vector_PRECISION_alloc( &(l->next_level->p_PRECISION.x), _ORDINARY, 2*g.num_rhs_vect, l->next_level, no_threading ); #else - MALLOC( l->next_level->p_PRECISION.b, complex_PRECISION, 2*l->next_level->vector_size ); - l->next_level->p_PRECISION.x = l->next_level->p_PRECISION.b + l->next_level->vector_size; + vector_PRECISION_alloc( &(l->next_level->p_PRECISION.b), _ORDINARY, g.num_rhs_vect, l->next_level, no_threading ); + vector_PRECISION_alloc( &(l->next_level->p_PRECISION.x), _ORDINARY, g.num_rhs_vect, l->next_level, no_threading ); #endif l->next_level->p_PRECISION.v_start = 0; l->next_level->p_PRECISION.v_end = l->next_level->inner_vector_size; @@ -171,15 +163,13 @@ void next_level_PRECISION_setup( level_struct *l ) { } int i, n = (l->next_level->level>0)?6:4; + for ( i=0; inext_level->vbuf_PRECISION[0], complex_PRECISION, 2*n*l->next_level->vector_size ); - for ( i=1; inext_level->vbuf_PRECISION[i] = l->next_level->vbuf_PRECISION[0] + 2*i*l->next_level->vector_size; + vector_PRECISION_alloc( &(l->next_level->vbuf_PRECISION[i]), _ORDINARY, 2*g.num_rhs_vect, l->next_level, no_threading ); #else - MALLOC( l->next_level->vbuf_PRECISION[0], complex_PRECISION, n*l->next_level->vector_size ); - for ( i=1; inext_level->vbuf_PRECISION[i] = l->next_level->vbuf_PRECISION[0] + i*l->next_level->vector_size; + vector_PRECISION_alloc( &(l->next_level->vbuf_PRECISION[i]), _ORDINARY, g.num_rhs_vect, l->next_level, no_threading ); #endif + } } } @@ -192,21 +182,13 @@ void next_level_PRECISION_free( level_struct *l ) { if ( ( l->level == 1 && !l->next_level->idle ) || g.kcycle ) { fgmres_PRECISION_struct_free( &(l->next_level->p_PRECISION), l->next_level ); } else { -#ifdef HAVE_TM1p1 - FREE( l->next_level->p_PRECISION.b, complex_PRECISION, 2*2*l->next_level->vector_size ); -#else - FREE( l->next_level->p_PRECISION.b, complex_PRECISION, 2*l->next_level->vector_size ); -#endif + vector_PRECISION_free( &(l->next_level->p_PRECISION.b), l->next_level, no_threading ); + vector_PRECISION_free( &(l->next_level->p_PRECISION.x), l->next_level, no_threading ); } int i, n = (l->next_level->level>0)?6:4; - for ( i=1; inext_level->vbuf_PRECISION[i] = NULL; -#ifdef HAVE_TM1p1 - FREE( l->next_level->vbuf_PRECISION[0], complex_PRECISION, 2*n*l->next_level->vector_size ); -#else - FREE( l->next_level->vbuf_PRECISION[0], complex_PRECISION, n*l->next_level->vector_size ); -#endif + for ( i=0; inext_level->vbuf_PRECISION[i]), l->next_level, no_threading ); coarsening_index_table_PRECISION_free( &(l->is_PRECISION), l ); } @@ -217,7 +199,7 @@ void next_level_PRECISION_free( level_struct *l ) { void level_PRECISION_init( level_struct *l ) { for ( int i=0; i<9; i++ ) - l->vbuf_PRECISION[i] = NULL; + vector_PRECISION_init( &(l->vbuf_PRECISION[i]) ); operator_PRECISION_init( &(l->op_PRECISION) ); operator_PRECISION_init( &(l->oe_op_PRECISION) ); @@ -231,20 +213,23 @@ void 
level_PRECISION_init( level_struct *l ) { void vcycle_timing_PRECISION( int n, level_struct *l, struct Thread *threading ) { ASSERT( g.mixed_precision ); - vector_PRECISION v1 = NULL, v2 = NULL; + vector_PRECISION v1, v2; + vector_PRECISION_init(&v1); + vector_PRECISION_init(&v2); + double t0=0, t1=0; - PUBLIC_MALLOC( v1, complex_PRECISION, l->inner_vector_size ); - PUBLIC_MALLOC( v2, complex_PRECISION, l->inner_vector_size ); + vector_PRECISION_alloc(&v1, _INNER, 1, l, threading); + vector_PRECISION_alloc(&v2, _INNER, 1, l, threading); START_LOCKED_MASTER(threading) - vector_PRECISION_define_random( v2, 0, l->inner_vector_size, l ); + vector_PRECISION_define_random( &v2, 0, l->inner_vector_size, l ); END_LOCKED_MASTER(threading) START_MASTER(threading) t0 = MPI_Wtime(); END_MASTER(threading) for ( int i=0; iinner_vector_size ); - PUBLIC_FREE( v2, complex_PRECISION, l->inner_vector_size ); + vector_PRECISION_free(&v1, l, threading); + vector_PRECISION_free(&v2, l, threading); END_LOCKED_MASTER(threading) } diff --git a/src/interpolation_generic.c b/src/interpolation_generic.c index 8981bec..b879824 100644 --- a/src/interpolation_generic.c +++ b/src/interpolation_generic.c @@ -21,39 +21,34 @@ #include "main.h" -#if ( !defined( SSE ) || !defined( INTERPOLATION_OPERATOR_LAYOUT_OPTIMIZED_PRECISION ) ) - void interpolation_PRECISION_alloc( level_struct *l ) { int k, n = l->num_eig_vect; MALLOC( l->is_PRECISION.eigenvalues, complex_PRECISION, n ); - MALLOC( l->is_PRECISION.test_vector, complex_PRECISION*, n ); - MALLOC( l->is_PRECISION.interpolation, complex_PRECISION*, n ); - l->is_PRECISION.interpolation[0] = NULL; - MALLOC_HUGEPAGES( l->is_PRECISION.interpolation[0], complex_PRECISION, n*l->vector_size, 64 ); - for ( k=1; kis_PRECISION.interpolation[k] = l->is_PRECISION.interpolation[0] + k*l->vector_size; - MALLOC( l->is_PRECISION.operator, complex_PRECISION, n*l->inner_vector_size ); - l->is_PRECISION.test_vector[0] = NULL; - MALLOC_HUGEPAGES( l->is_PRECISION.test_vector[0], complex_PRECISION, n*l->inner_vector_size, 64 ); - for ( k=1; kis_PRECISION.test_vector[k] = l->is_PRECISION.test_vector[0] + k*l->inner_vector_size; + MALLOC( l->is_PRECISION.test_vector, vector_PRECISION, n ); + MALLOC( l->is_PRECISION.interpolation, vector_PRECISION, n ); + for ( k=0; kis_PRECISION.interpolation[k])); + vector_PRECISION_alloc(&(l->is_PRECISION.interpolation[k]), _ORDINARY, 1, l, no_threading ); + vector_PRECISION_init(&(l->is_PRECISION.test_vector[k])); + vector_PRECISION_alloc(&(l->is_PRECISION.test_vector[k]), _INNER, 1, l, no_threading ); } + MALLOC( l->is_PRECISION.operator, complex_PRECISION, n*l->inner_vector_size ); } void interpolation_PRECISION_dummy_alloc( level_struct *l ) { - MALLOC( l->is_PRECISION.test_vector, complex_PRECISION*, l->num_eig_vect ); - MALLOC( l->is_PRECISION.interpolation, complex_PRECISION*, l->num_eig_vect ); + MALLOC( l->is_PRECISION.test_vector, vector_PRECISION, l->num_eig_vect ); + MALLOC( l->is_PRECISION.interpolation, vector_PRECISION, l->num_eig_vect ); } void interpolation_PRECISION_dummy_free( level_struct *l ) { - FREE( l->is_PRECISION.test_vector, complex_PRECISION*, l->num_eig_vect ); - FREE( l->is_PRECISION.interpolation, complex_PRECISION*, l->num_eig_vect ); + FREE( l->is_PRECISION.test_vector, vector_PRECISION, l->num_eig_vect ); + FREE( l->is_PRECISION.interpolation, vector_PRECISION, l->num_eig_vect ); } @@ -61,17 +56,19 @@ void interpolation_PRECISION_free( level_struct *l ) { int n = l->num_eig_vect; - FREE_HUGEPAGES( l->is_PRECISION.test_vector[0], 
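The fine- and coarse-level allocations above switch from MALLOC/FREE (and PUBLIC_MALLOC) on raw complex_PRECISION buffers to vector_PRECISION_init / vector_PRECISION_alloc / vector_PRECISION_free, taking a size class (_INNER or _ORDINARY) and a vector count (g.num_rhs_vect, doubled under HAVE_TM1p1) instead of an explicit length. Below is a minimal stand-alone sketch of that lifecycle, modelled on the rewritten vcycle_timing_PRECISION; the real calls also take the level_struct and a threading argument, and the two lengths come from l->inner_vector_size and l->vector_size rather than being passed explicitly as they are here.

#include <complex.h>
#include <stdlib.h>
#include <string.h>

/* Stand-ins for the patch's vector API; the names follow the diff, the
 * bodies are simplified guesses (no threading, no hugepages, no level). */
typedef enum { _INNER = 0, _ORDINARY = 1 } vl_type_t;

typedef struct {
  float complex *vector_buffer;
  int num_vect;
  int size;                      /* entries per right-hand side */
} vector_float_sketch;

static void vector_float_init_sketch( vector_float_sketch *v ) {
  memset( v, 0, sizeof(*v) );
}

static void vector_float_alloc_sketch( vector_float_sketch *v, vl_type_t type,
                                       int num_vect, int inner_size, int full_size ) {
  v->size = ( type == _INNER ) ? inner_size : full_size;
  v->num_vect = num_vect;
  v->vector_buffer = calloc( (size_t)v->size*num_vect, sizeof(float complex) );
}

static void vector_float_free_sketch( vector_float_sketch *v ) {
  free( v->vector_buffer );
  vector_float_init_sketch( v );
}

/* Same shape as the rewritten vcycle_timing_PRECISION: init, alloc with a
 * size class and a vector count, fill, use, free. */
int main( void ) {
  vector_float_sketch v1, v2;
  vector_float_init_sketch( &v1 );
  vector_float_init_sketch( &v2 );
  vector_float_alloc_sketch( &v1, _INNER, 1, 24, 32 );
  vector_float_alloc_sketch( &v2, _INNER, 1, 24, 32 );
  for ( int i = 0; i < v2.size; i++ )
    v2.vector_buffer[i] = (float)rand() / (float)RAND_MAX;  /* stand-in for vector_PRECISION_define_random */
  /* ... apply the V-cycle to v2 and time it here ... */
  vector_float_free_sketch( &v1 );
  vector_float_free_sketch( &v2 );
  return 0;
}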
complex_PRECISION, n*l->inner_vector_size ); + for (int k=0; kis_PRECISION.interpolation[k]), l, no_threading ); + vector_PRECISION_free(&(l->is_PRECISION.test_vector[k]), l, no_threading ); + } FREE( l->is_PRECISION.eigenvalues, complex_PRECISION, n ); - FREE( l->is_PRECISION.test_vector, complex_PRECISION*, n ); - FREE_HUGEPAGES( l->is_PRECISION.interpolation[0], complex_PRECISION, n*l->vector_size ); - FREE( l->is_PRECISION.interpolation, complex_PRECISION*, n ); + FREE( l->is_PRECISION.test_vector, vector_PRECISION, n ); + FREE( l->is_PRECISION.interpolation, vector_PRECISION, n ); FREE( l->is_PRECISION.operator, complex_PRECISION, n*l->inner_vector_size ); } -void define_interpolation_PRECISION_operator( complex_PRECISION **interpolation, level_struct *l, struct Thread *threading ) { +void define_interpolation_PRECISION_operator( vector_PRECISION *interpolation, level_struct *l, struct Thread *threading ) { int j, num_eig_vect = l->num_eig_vect; complex_PRECISION *operator = l->is_PRECISION.operator; @@ -83,31 +80,31 @@ void define_interpolation_PRECISION_operator( complex_PRECISION **interpolation, operator += start*num_eig_vect; for ( int i=start; iis_PRECISION.num_agg, num_eig_vect = l->num_eig_vect, sign = 1, num_parent_eig_vect = l->num_parent_eig_vect, aggregate_sites = l->num_inner_lattice_sites / num_aggregates; - complex_PRECISION *operator = l->is_PRECISION.operator, *phi_pt = phi, - *phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer; + complex_PRECISION *operator = l->is_PRECISION.operator, *phi_pt = phi->vector_buffer, + *phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer.vector_buffer; START_LOCKED_MASTER(threading) - vector_PRECISION_distribute( phi_c_pt, phi_c, l->next_level ); + vector_PRECISION_distribute( &(l->next_level->gs_PRECISION.transfer_buffer), phi_c, l->next_level ); END_LOCKED_MASTER(threading) SYNC_HYPERTHREADS(threading) #ifdef HAVE_TM1p1 if( g.n_flavours==2 ) for ( i=threading->n_thread*threading->core + threading->thread; in_core*threading->n_thread ) { - phi_pt = phi + i*2*2*num_parent_eig_vect*aggregate_sites; - phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer + i*2*2*num_eig_vect; + phi_pt = phi->vector_buffer + i*2*2*num_parent_eig_vect*aggregate_sites; + phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer.vector_buffer + i*2*2*num_eig_vect; operator = l->is_PRECISION.operator + i*2*num_eig_vect*num_parent_eig_vect*aggregate_sites; for ( k=0; kn_thread*threading->core + threading->thread; in_core*threading->n_thread ) { - phi_pt = phi + i*2*num_parent_eig_vect*aggregate_sites; - phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer + i*2*num_eig_vect; + phi_pt = phi->vector_buffer + i*2*num_parent_eig_vect*aggregate_sites; + phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer.vector_buffer + i*2*num_eig_vect; operator = l->is_PRECISION.operator + i*2*num_eig_vect*num_parent_eig_vect*aggregate_sites; for ( k=0; kis_PRECISION.num_agg, num_eig_vect = l->num_eig_vect, num_parent_eig_vect = l->num_parent_eig_vect, aggregate_sites = l->num_inner_lattice_sites / num_aggregates; - complex_PRECISION *operator = l->is_PRECISION.operator, *phi_pt = phi, - *phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer; + complex_PRECISION *operator = l->is_PRECISION.operator, *phi_pt = phi->vector_buffer, + *phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer.vector_buffer; START_LOCKED_MASTER(threading) - vector_PRECISION_distribute( phi_c_pt, phi_c, l->next_level ); + vector_PRECISION_distribute( 
&(l->next_level->gs_PRECISION.transfer_buffer), phi_c, l->next_level ); END_LOCKED_MASTER(threading) SYNC_HYPERTHREADS(threading) #ifdef HAVE_TM1p1 if( g.n_flavours==2 ) for ( i=threading->n_thread*threading->core + threading->thread; in_core*threading->n_thread ) { - phi_pt = phi + i*2*2*num_parent_eig_vect*aggregate_sites; - phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer + i*2*2*num_eig_vect; + phi_pt = phi->vector_buffer + i*2*2*num_parent_eig_vect*aggregate_sites; + phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer.vector_buffer + i*2*2*num_eig_vect; int sign = 1; operator = l->is_PRECISION.operator + i*2*num_eig_vect*num_parent_eig_vect*aggregate_sites; for ( k=0; kn_thread*threading->core + threading->thread; in_core*threading->n_thread ) { - phi_pt = phi + i*2*num_parent_eig_vect*aggregate_sites; - phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer + i*2*num_eig_vect; + phi_pt = phi->vector_buffer + i*2*num_parent_eig_vect*aggregate_sites; + phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer.vector_buffer + i*2*num_eig_vect; int sign = 1; operator = l->is_PRECISION.operator + i*2*num_eig_vect*num_parent_eig_vect*aggregate_sites; for ( k=0; kis_PRECISION.num_agg, num_eig_vect = l->num_eig_vect, sign = 1, num_parent_eig_vect = l->num_parent_eig_vect, aggregate_sites = l->num_inner_lattice_sites / num_aggregates; - complex_PRECISION *operator = l->is_PRECISION.operator, *phi_pt = phi, - *phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer; + complex_PRECISION *operator = l->is_PRECISION.operator, *phi_pt = phi->vector_buffer, + *phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer.vector_buffer; #ifdef HAVE_TM1p1 if( g.n_flavours==2 ) for ( i=threading->n_thread*threading->core + threading->thread; in_core*threading->n_thread ) { - phi_pt = phi + i*2*2*num_parent_eig_vect*aggregate_sites; - phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer + i*2*2*num_eig_vect; + phi_pt = phi->vector_buffer + i*2*2*num_parent_eig_vect*aggregate_sites; + phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer.vector_buffer + i*2*2*num_eig_vect; operator = l->is_PRECISION.operator + i*2*num_eig_vect*num_parent_eig_vect*aggregate_sites; for ( j=0; j<2*2*num_eig_vect; j++ ) @@ -280,8 +277,8 @@ void restrict_PRECISION( vector_PRECISION phi_c, vector_PRECISION phi, level_str else #endif for ( i=threading->n_thread*threading->core + threading->thread; in_core*threading->n_thread ) { - phi_pt = phi + i*2*num_parent_eig_vect*aggregate_sites; - phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer + i*2*num_eig_vect; + phi_pt = phi->vector_buffer + i*2*num_parent_eig_vect*aggregate_sites; + phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer.vector_buffer + i*2*num_eig_vect; operator = l->is_PRECISION.operator + i*2*num_eig_vect*num_parent_eig_vect*aggregate_sites; for ( j=0; j<2*num_eig_vect; j++ ) @@ -303,9 +300,8 @@ void restrict_PRECISION( vector_PRECISION phi_c, vector_PRECISION phi, level_str SYNC_HYPERTHREADS(threading) START_LOCKED_MASTER(threading) - vector_PRECISION_gather( phi_c, l->next_level->gs_PRECISION.transfer_buffer, l->next_level ); + vector_PRECISION_gather( phi_c, &(l->next_level->gs_PRECISION.transfer_buffer), l->next_level ); END_LOCKED_MASTER(threading) PROF_PRECISION_STOP( _PR, 1, threading ); } -#endif diff --git a/src/interpolation_generic.h b/src/interpolation_generic.h index 97be6ec..43c65d2 100644 --- a/src/interpolation_generic.h +++ b/src/interpolation_generic.h @@ -29,10 +29,10 @@ void interpolation_PRECISION_dummy_alloc( level_struct *l ); void 
interpolation_PRECISION_dummy_free( level_struct *l ); - void interpolate_PRECISION( vector_PRECISION phi, vector_PRECISION phi_c, level_struct *l, struct Thread *threading ); - void interpolate3_PRECISION( vector_PRECISION phi, vector_PRECISION phi_c, level_struct *l, struct Thread *threading ); - void restrict_PRECISION( vector_PRECISION phi_c, vector_PRECISION phi, level_struct *l, struct Thread *threading ); + void interpolate_PRECISION( vector_PRECISION *phi, vector_PRECISION *phi_c, level_struct *l, struct Thread *threading ); + void interpolate3_PRECISION( vector_PRECISION *phi, vector_PRECISION *phi_c, level_struct *l, struct Thread *threading ); + void restrict_PRECISION( vector_PRECISION *phi_c, vector_PRECISION *phi, level_struct *l, struct Thread *threading ); - void define_interpolation_PRECISION_operator( complex_PRECISION **interpolation, level_struct *l, struct Thread *threading ); + void define_interpolation_PRECISION_operator( vector_PRECISION *interpolation, level_struct *l, struct Thread *threading ); #endif diff --git a/src/io.c b/src/io.c index 02b5ceb..fe9272a 100644 --- a/src/io.c +++ b/src/io.c @@ -716,8 +716,9 @@ void vector_io( double *phi, char *filename, const int mode, level_struct *l ) { FREE( buffer[0].data, double, bar_size ); FREE( buffer[1].data, double, bar_size ); } - - norm = global_norm_double( (vector_double)phi, 0, l->inner_vector_size, l, no_threading ); + vector_double phi_vec; + phi_vec.vector_buffer = (buffer_double) phi; + norm = global_norm_double( &phi_vec, 0, l->inner_vector_size, l, no_threading ); printf0("norm: %e\n", norm ); printf0("...done (%lf seconds)\n\n", t1-t0 ); } @@ -871,7 +872,7 @@ void vector_io_single_file( double *psi, double *lambda, char *filename, const i ASSERT( fread( buffer_pt->data, sizeof(double), bar_size, file ) ); } - phi=(double *) (l->x); + phi=(double *) (&(l->x)); phi_pt=phi; for ( t=0; tis_float.test_vector[j], l->x, l->s_float.op.translation_table, l, no_threading); + trans_float(&(l->is_float.test_vector[j]), &(l->x), l->s_float.op.translation_table, l, no_threading); else - trans_double(l->is_double.test_vector[j], l->x, l->s_double.op.translation_table, l, no_threading); + trans_double(&(l->is_double.test_vector[j]), &(l->x), l->s_double.op.translation_table, l, no_threading); } else { - vector_double_copy( ((vector_double)psi)+j*l->inner_vector_size, l->x, 0, l->inner_vector_size, l ); + vector_double psi_vec; + psi_vec.vector_buffer = ((buffer_double) psi) + j*l->inner_vector_size; + vector_double_copy( &psi_vec, &(l->x), 0, l->inner_vector_size, l ); } } } else if ( mode == _WRITE ) { @@ -927,13 +930,15 @@ void vector_io_single_file( double *psi, double *lambda, char *filename, const i for ( j=0; jx, l->is_float.test_vector[j], l->s_float.op.translation_table, l, no_threading ); + trans_back_float( &(l->x), &(l->is_float.test_vector[j]), l->s_float.op.translation_table, l, no_threading ); else - trans_back_double( l->x, l->is_double.test_vector[j], l->s_double.op.translation_table, l, no_threading ); + trans_back_double( &(l->x), &(l->is_double.test_vector[j]), l->s_double.op.translation_table, l, no_threading ); } else { - vector_double_copy( l->x, ((complex_double*)psi)+j*l->inner_vector_size, 0, l->inner_vector_size, l ); + vector_double psi_vec; + psi_vec.vector_buffer = ((complex_double*)psi)+j*l->inner_vector_size; + vector_double_copy( &(l->x), &psi_vec, 0, l->inner_vector_size, l ); } - phi=(double *)(l->x); + phi=(double *)(&(l->x)); phi_pt=phi; for ( t=0; tnext->data, bar_size, MPI_DOUBLE, 
desired_rank, 0, g.comm_cart, &rreq ); } diff --git a/src/linalg.c b/src/linalg.c index 3487404..e884e46 100644 --- a/src/linalg.c +++ b/src/linalg.c @@ -21,9 +21,8 @@ #include "main.h" -#ifndef OPTIMIZED_LINALG_float void process_multi_inner_product_MP( int count, complex_double *results, vector_float *phi, - vector_float psi, int start, int end, level_struct *l, + vector_float *psi, int start, int end, level_struct *l, struct Thread *threading ) { PROF_float_START( _PIP, threading ); @@ -39,7 +38,7 @@ void process_multi_inner_product_MP( int count, complex_double *results, vector_ compute_core_start_end_custom(start, end, &thread_start, &thread_end, l, threading, 12); for(int c=0; cvector_buffer[i]; i++; ) } } @@ -60,9 +59,30 @@ void process_multi_inner_product_MP( int count, complex_double *results, vector_ PROF_float_STOP( _PIP, (double)(end-start)/(double)l->inner_vector_size, threading ); } -#endif -double global_norm_MP( vector_float x, int start, int end, level_struct *l, struct Thread *threading ) { + +void process_multi_inner_product_MP_new( int count, complex_double *results, vector_float *phi, + vector_float *psi, level_struct *l, struct Thread *threading ) { + + int start, end; + compute_core_start_end(0, psi->size, &start, &end, l, threading); + int thread = omp_get_thread_num(); + if(thread == 0 && start != end) + PROF_float_START( _PIP, threading ); + + int i, j, jj; + for(int c=0; cnum_vect, jj, results[c*psi->num_vect+j+jj] = 0.0;) + + for(int c=0; cnum_vect, jj, results[c*psi->num_vect+j+jj] += (complex_double) conj_float(phi[c].vector_buffer[i*psi->num_vect+j+jj])*psi->vector_buffer[i*psi->num_vect+j+jj];) + + if(thread == 0 && start != end) + PROF_float_STOP( _PIP, (double)(end-start)/(double)l->inner_vector_size, threading ); +} + +double global_norm_MP( vector_float *x, int start, int end, level_struct *l, struct Thread *threading ) { PROF_float_START( _GIP, threading ); @@ -75,7 +95,7 @@ double global_norm_MP( vector_float x, int start, int end, level_struct *l, stru SYNC_CORES(threading) for ( i=thread_start; ivector_buffer[i]); i++; ) // sum over cores START_NO_HYPERTHREADS(threading) @@ -109,3 +129,23 @@ double global_norm_MP( vector_float x, int start, int end, level_struct *l, stru return sqrt((double)local_alpha); } } + +void global_norm_MP_new( double *res, vector_float *x, level_struct *l, struct Thread *threading ) { + + int start, end; + compute_core_start_end(0, x->size, &start, &end, l, threading); + int thread = omp_get_thread_num(); + if(thread == 0 && start != end) + PROF_float_START( _GIP, threading ); + + int i, j, jj; + VECTOR_LOOP(j, x->num_vect, jj, res[j+jj]=0;) + + for( i=start; inum_vect, jj, res[j+jj] += NORM_SQUARE_float(x->vector_buffer[i*x->num_vect+j+jj]);) + + VECTOR_LOOP(j, x->num_vect, jj, res[j+jj] = (double)sqrt((double)res[j+jj]);) + + if(thread == 0 && start != end) + PROF_float_STOP( _GIP, (double)(end-start)/(double)l->inner_vector_size, threading ); +} diff --git a/src/linalg.h b/src/linalg.h index 4182def..80e9514 100644 --- a/src/linalg.h +++ b/src/linalg.h @@ -24,16 +24,26 @@ struct Thread; - void vector_double_multi_saxpy( vector_double z, vector_double *V, complex_double *alpha, + void vector_double_multi_saxpy( vector_double *z, vector_double *V, complex_double *alpha, int sign, int count, int start, int end, level_struct *l ); - void vector_float_multi_saxpy( vector_float z, vector_float *V, complex_float *alpha, + void vector_float_multi_saxpy( vector_float *z, vector_float *V, complex_float *alpha, int sign, int count, 
int start, int end, level_struct *l ); - + + void vector_double_multi_saxpy_new( vector_double *z, vector_double *V, complex_double *alpha, + int sign, int count, level_struct *l, struct Thread *threading ); + + void vector_float_multi_saxpy_new( vector_float *z, vector_float *V, complex_float *alpha, + int sign, int count, level_struct *l, struct Thread *threading ); + void process_multi_inner_product_MP( int count, complex_double *results, vector_float *phi, - vector_float psi, int start, int end, level_struct *l, + vector_float *psi, int start, int end, level_struct *l, struct Thread *threading ); - - double global_norm_MP( vector_float x, int start, int end, level_struct *l, struct Thread *threading ); + void process_multi_inner_product_MP_new( int count, complex_double *results, vector_float *phi, + vector_float *psi, level_struct *l, struct Thread *threading ); + + double global_norm_MP( vector_float *x, int start, int end, level_struct *l, struct Thread *threading ); + + void global_norm_MP_new( double *res, vector_float *x, level_struct *l, struct Thread *threading ); #endif diff --git a/src/linalg_generic.c b/src/linalg_generic.c index db223bd..22f520f 100644 --- a/src/linalg_generic.c +++ b/src/linalg_generic.c @@ -21,12 +21,7 @@ #include "main.h" -#include "sse_float_intrinsic.h" -#include "sse_linalg.h" -#include "sse_linalg_PRECISION.h" - -#ifndef OPTIMIZED_LINALG_PRECISION -complex_PRECISION global_inner_product_PRECISION( vector_PRECISION phi, vector_PRECISION psi, int start, int end, level_struct *l, struct Thread *threading ) { +complex_PRECISION global_inner_product_PRECISION( vector_PRECISION *phi, vector_PRECISION *psi, int start, int end, level_struct *l, struct Thread *threading ) { PROF_PRECISION_START( _GIP, threading ); complex_PRECISION local_alpha = 0, global_alpha = 0; @@ -37,7 +32,7 @@ complex_PRECISION global_inner_product_PRECISION( vector_PRECISION phi, vector_P SYNC_CORES(threading) - VECTOR_FOR( int i=thread_start, ivector_buffer[i])*psi->vector_buffer[i], i++, l ); // sum over cores START_NO_HYPERTHREADS(threading) @@ -71,10 +66,9 @@ complex_PRECISION global_inner_product_PRECISION( vector_PRECISION phi, vector_P return local_alpha; } } -#endif -complex_PRECISION process_inner_product_PRECISION( vector_PRECISION phi, vector_PRECISION psi, int start, int end, level_struct *l, struct Thread *threading ) { +complex_PRECISION process_inner_product_PRECISION( vector_PRECISION *phi, vector_PRECISION *psi, int start, int end, level_struct *l, struct Thread *threading ) { PROF_PRECISION_START( _PIP, threading ); int i; @@ -82,7 +76,7 @@ complex_PRECISION process_inner_product_PRECISION( vector_PRECISION phi, vector_ SYNC_CORES(threading) - THREADED_VECTOR_FOR( i, start, end, local_alpha += conj_PRECISION(phi[i])*psi[i], i++, l, threading ); + THREADED_VECTOR_FOR( i, start, end, local_alpha += conj_PRECISION(phi->vector_buffer[i])*psi->vector_buffer[i], i++, l, threading ); START_NO_HYPERTHREADS(threading) ((complex_PRECISION *)threading->workspace)[threading->core] = local_alpha; @@ -103,8 +97,7 @@ complex_PRECISION process_inner_product_PRECISION( vector_PRECISION phi, vector_ } -#if !defined( OPTIMIZED_LINALG_PRECISION ) -void process_multi_inner_product_PRECISION( int count, complex_PRECISION *results, vector_PRECISION *phi, vector_PRECISION psi, +void process_multi_inner_product_PRECISION( int count, complex_PRECISION *results, vector_PRECISION *phi, vector_PRECISION *psi, int start, int end, level_struct *l, struct Thread *threading ) { PROF_PRECISION_START( 
_PIP, threading ); @@ -120,18 +113,18 @@ void process_multi_inner_product_PRECISION( int count, complex_PRECISION *result compute_core_start_end_custom(start, end, &thread_start, &thread_end, l, threading, 12); for(int c=0; cvector_buffer[i]; i++; ) } else { #ifdef _M10TV compute_core_start_end_custom(start, end, &thread_start, &thread_end, l, threading, 20); for(int c=0; cvector_buffer[i]; i++; ) #else compute_core_start_end_custom(start, end, &thread_start, &thread_end, l, threading, 2); for(int c=0; cvector_buffer[i]; i++; ) #endif } @@ -152,14 +145,35 @@ void process_multi_inner_product_PRECISION( int count, complex_PRECISION *result PROF_PRECISION_STOP( _PIP, (double)(end-start)/(double)l->inner_vector_size, threading ); } -#endif -complex_PRECISION local_xy_over_xx_PRECISION( vector_PRECISION phi, vector_PRECISION psi, int start, int end, level_struct *l ) { + +void process_multi_inner_product_PRECISION_new( int count, complex_PRECISION *results, vector_PRECISION *phi, vector_PRECISION *psi, + level_struct *l, struct Thread *threading ) { + + int start, end; + compute_core_start_end(0, psi->size, &start, &end, l, threading); + int thread = omp_get_thread_num(); + if(thread == 0 && start != end) + PROF_PRECISION_START( _PIP, threading ); + + int i, j, jj; + VECTOR_LOOP(j, count*psi->num_vect, jj, results[j+jj] = 0.0;) + + for(int c=0; cnum_vect, jj, results[c*psi->num_vect+j+jj] += conj_PRECISION(phi[c].vector_buffer[i*psi->num_vect+j+jj])*psi->vector_buffer[i*psi->num_vect+j+jj];) + + if(thread == 0 && start != end) + PROF_PRECISION_STOP( _PIP, (double)(end-start)/(double)l->inner_vector_size, threading ); +} + + +complex_PRECISION local_xy_over_xx_PRECISION( vector_PRECISION *phi, vector_PRECISION *psi, int start, int end, level_struct *l ) { complex_PRECISION numerator = 0.0; PRECISION denominator = 0.0; - VECTOR_FOR( int i=start, ivector_buffer[i])*psi->vector_buffer[i]; denominator += NORM_SQUARE_PRECISION(phi->vector_buffer[i]), i++, l ); if ( abs_PRECISION(denominator) < EPS_PRECISION ) { return 0.0; @@ -168,8 +182,7 @@ complex_PRECISION local_xy_over_xx_PRECISION( vector_PRECISION phi, vector_PRECI return numerator/denominator; } -#ifndef OPTIMIZED_LINALG_PRECISION -PRECISION global_norm_PRECISION( vector_PRECISION x, int start, int end, level_struct *l, struct Thread *threading ) { +PRECISION global_norm_PRECISION( vector_PRECISION *x, int start, int end, level_struct *l, struct Thread *threading ) { PROF_PRECISION_START( _GIP, threading ); @@ -181,7 +194,7 @@ PRECISION global_norm_PRECISION( vector_PRECISION x, int start, int end, level_s SYNC_CORES(threading) - VECTOR_FOR( int i=thread_start, ivector_buffer[i]), i++, l ); // sum over cores START_NO_HYPERTHREADS(threading) @@ -215,9 +228,8 @@ PRECISION global_norm_PRECISION( vector_PRECISION x, int start, int end, level_s return (PRECISION)sqrt((double)local_alpha); } } -#endif -PRECISION process_norm_PRECISION( vector_PRECISION x, int start, int end, level_struct *l, struct Thread *threading ) { +PRECISION process_norm_PRECISION( vector_PRECISION *x, int start, int end, level_struct *l, struct Thread *threading ) { int i; PRECISION local_alpha = 0; @@ -225,7 +237,7 @@ PRECISION process_norm_PRECISION( vector_PRECISION x, int start, int end, level_ SYNC_CORES(threading) - THREADED_VECTOR_FOR( i, start, end, local_alpha += NORM_SQUARE_PRECISION(x[i]), i++, l, threading ); + THREADED_VECTOR_FOR( i, start, end, local_alpha += NORM_SQUARE_PRECISION(x->vector_buffer[i]), i++, l, threading ); START_NO_HYPERTHREADS(threading) 
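The *_new reduction kernels introduced here return one value per right-hand side: process_multi_inner_product_*_new fills results[c*num_vect + j] and global_norm_*_new fills res[j]. The serial sketch below spells out the arithmetic under the interleaved layout; the in-tree routines additionally split the site range across cores with compute_core_start_end and handle the cross-process reduction where needed, none of which is reproduced here.

#include <complex.h>
#include <math.h>

/* Serial sketches of the *_new reduction kernels: one result per
 * right-hand side, interleaved storage x[i*num_vect + j]. */

/* results[c*num_vect + j] = <phi_c(:,j), psi(:,j)>, c = 0..count-1 */
static void multi_inner_product_sketch( int count, double complex *results,
                                        const double complex *const *phi,
                                        const double complex *psi,
                                        int size, int num_vect ) {
  for ( int c = 0; c < count; c++ )
    for ( int j = 0; j < num_vect; j++ )
      results[c*num_vect + j] = 0.0;
  for ( int c = 0; c < count; c++ )
    for ( int i = 0; i < size; i++ )
      for ( int j = 0; j < num_vect; j++ )
        results[c*num_vect + j] += conj( phi[c][i*num_vect + j] ) * psi[i*num_vect + j];
}

/* res[j] = || x(:,j) ||_2, one norm per right-hand side */
static void norm_per_rhs_sketch( double *res, const double complex *x,
                                 int size, int num_vect ) {
  for ( int j = 0; j < num_vect; j++ )
    res[j] = 0.0;
  for ( int i = 0; i < size; i++ )
    for ( int j = 0; j < num_vect; j++ ) {
      double complex z = x[i*num_vect + j];
      res[j] += creal(z)*creal(z) + cimag(z)*cimag(z);
    }
  for ( int j = 0; j < num_vect; j++ )
    res[j] = sqrt( res[j] );
}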
((PRECISION *)threading->workspace)[threading->core] = local_alpha; @@ -245,65 +257,130 @@ PRECISION process_norm_PRECISION( vector_PRECISION x, int start, int end, level_ return (PRECISION)sqrt((double)local_alpha); } +void global_norm_PRECISION_new( PRECISION *res, vector_PRECISION *x, level_struct *l, struct Thread *threading ) { + + int start, end; + compute_core_start_end(0, x->size, &start, &end, l, threading); + int thread = omp_get_thread_num(); + if(thread == 0 && start != end) + PROF_PRECISION_START( _GIP, threading ); + + int i, j, jj; + VECTOR_LOOP(j, x->num_vect, jj, res[j+jj]=0;) + + for( i=start; inum_vect, jj, res[j+jj] += NORM_SQUARE_PRECISION(x->vector_buffer[i*x->num_vect+j+jj]);) + + VECTOR_LOOP(j, x->num_vect, jj, res[j+jj] = (PRECISION)sqrt((double)res[j+jj]);) + + if(thread == 0 && start != end) + PROF_PRECISION_STOP( _GIP, (double)(end-start)/(double)l->inner_vector_size, threading ); +} + -void vector_PRECISION_plus( vector_PRECISION z, vector_PRECISION x, vector_PRECISION y, int start, int end, level_struct *l ) { +void vector_PRECISION_plus( vector_PRECISION *z, vector_PRECISION *x, vector_PRECISION *y, int start, int end, level_struct *l ) { int thread = omp_get_thread_num(); if(thread == 0 && start != end) PROF_PRECISION_START( _LA2 ); - VECTOR_FOR( int i=start, ivector_buffer[i] = x->vector_buffer[i] + y->vector_buffer[i], i++, l ); if(thread == 0 && start != end) PROF_PRECISION_STOP( _LA2, (double)(end-start)/(double)l->inner_vector_size ); } -void vector_PRECISION_minus( vector_PRECISION z, vector_PRECISION x, vector_PRECISION y, int start, int end, level_struct *l ) { - +void vector_PRECISION_plus_new( vector_PRECISION *z, vector_PRECISION *x, vector_PRECISION *y, level_struct *l, struct Thread *threading ) { + + int i, j, jj, start, end; + compute_core_start_end(0, x->size, &start, &end, l, threading); int thread = omp_get_thread_num(); if(thread == 0 && start != end) PROF_PRECISION_START( _LA2 ); - VECTOR_FOR( int i=start, inum_vect, jj, z->vector_buffer[i*x->num_vect+j+jj] = x->vector_buffer[i*x->num_vect+j+jj] + y->vector_buffer[i*x->num_vect+j+jj];) + if(thread == 0 && start != end) PROF_PRECISION_STOP( _LA2, (double)(end-start)/(double)l->inner_vector_size ); } -#ifndef OPTIMIZED_LINALG_PRECISION -void vector_PRECISION_scale( vector_PRECISION z, vector_PRECISION x, complex_PRECISION alpha, int start, int end, level_struct *l ) { + +void vector_PRECISION_minus( vector_PRECISION *z, vector_PRECISION *x, vector_PRECISION *y, int start, int end, level_struct *l ) { + + int thread = omp_get_thread_num(); + if(thread == 0 && start != end) + PROF_PRECISION_START( _LA2 ); + + VECTOR_FOR( int i=start, ivector_buffer[i] = x->vector_buffer[i] - y->vector_buffer[i], i++, l ); + if(thread == 0 && start != end) + PROF_PRECISION_STOP( _LA2, (double)(end-start)/(double)l->inner_vector_size ); +} + + +void vector_PRECISION_minus_new( vector_PRECISION *z, vector_PRECISION *x, vector_PRECISION *y, level_struct *l, struct Thread *threading ) { + + int i, j, jj, start, end; + compute_core_start_end(0, y->size, &start, &end, l, threading); + int thread = omp_get_thread_num(); + if(thread == 0 && start != end) + PROF_PRECISION_START( _LA2 ); + + for( i=start; inum_vect, jj, z->vector_buffer[i*x->num_vect+j+jj] = x->vector_buffer[i*x->num_vect+j+jj] - y->vector_buffer[i*x->num_vect+j+jj];) + + if(thread == 0 && start != end) + PROF_PRECISION_STOP( _LA2, (double)(end-start)/(double)l->inner_vector_size ); +} + +void vector_PRECISION_scale( vector_PRECISION *z, 
vector_PRECISION *x, complex_PRECISION alpha, int start, int end, level_struct *l ) { + int thread = omp_get_thread_num(); if(thread == 0 && start != end) PROF_PRECISION_START( _LA6 ); - VECTOR_FOR( int i=start, ivector_buffer[i] = alpha*x->vector_buffer[i], i++, l ); if(thread == 0 && start != end) PROF_PRECISION_STOP( _LA6, (double)(end-start)/(double)l->inner_vector_size ); } -#endif +void vector_PRECISION_scale_new( vector_PRECISION *z, vector_PRECISION *x, complex_PRECISION *alpha, int k, level_struct *l, struct Thread *threading ) { + + int i, j, jj, start, end; + compute_core_start_end(0, x->size, &start, &end, l, threading); + int thread = omp_get_thread_num(); + if(thread == 0 && start != end) + PROF_PRECISION_START( _LA6 ); + + for( i=start; inum_vect, jj, z->vector_buffer[i*x->num_vect+j+jj] = alpha[k*x->num_vect+j+jj]*x->vector_buffer[i*x->num_vect+j+jj];) -void vector_PRECISION_real_scale( vector_PRECISION z, vector_PRECISION x, complex_PRECISION alpha, + if(thread == 0 && start != end) + PROF_PRECISION_STOP( _LA6, (double)(end-start)/(double)l->inner_vector_size ); +} + + +void buffer_PRECISION_real_scale( complex_PRECISION *z, complex_PRECISION *x, complex_PRECISION alpha, int start, int end, level_struct *l ) { - + PRECISION *r_z = (PRECISION*)z, *r_x = (PRECISION*)x, r_alpha = creal_PRECISION(alpha); int r_start = 2*start, r_end = 2*end; - + int thread = omp_get_thread_num(); if(thread == 0 && start != end) PROF_PRECISION_START( _LA2 ); - + REAL_VECTOR_FOR( int i=r_start, iinner_vector_size ); } -void vector_PRECISION_copy( vector_PRECISION z, vector_PRECISION x, int start, int end, level_struct *l ) { +void buffer_PRECISION_copy( complex_PRECISION *z, complex_PRECISION *x, int start, int end, level_struct *l ) { int thread = omp_get_thread_num(); if(thread == 0 && start != end) @@ -315,22 +392,41 @@ void vector_PRECISION_copy( vector_PRECISION z, vector_PRECISION x, int start, i PROF_PRECISION_STOP( _CPY, (double)(end-start)/(double)l->inner_vector_size ); } -#ifndef OPTIMIZED_LINALG_PRECISION -void vector_PRECISION_saxpy( vector_PRECISION z, vector_PRECISION x, vector_PRECISION y, complex_PRECISION alpha, int start, int end, level_struct *l ) { - +void vector_PRECISION_saxpy( vector_PRECISION *z, vector_PRECISION *x, vector_PRECISION *y, complex_PRECISION alpha, int start, int end, level_struct *l ) { + int thread = omp_get_thread_num(); if (thread == 0 && start != end ) PROF_PRECISION_START( _LA8 ); - VECTOR_FOR( int i=start, ivector_buffer[i] = x->vector_buffer[i] + alpha*y->vector_buffer[i], i++, l ); if( thread == 0 && start != end ) PROF_PRECISION_STOP( _LA8, (double)(end-start)/(double)l->inner_vector_size ); } -#endif -#ifndef OPTIMIZED_LINALG_PRECISION -void vector_PRECISION_multi_saxpy( vector_PRECISION z, vector_PRECISION *V, complex_PRECISION *alpha, +// New input variable: sign +// sign == 1 : plus +// else: minus +void vector_PRECISION_saxpy_new( vector_PRECISION *z, vector_PRECISION *x, vector_PRECISION *y, complex_PRECISION *alpha, int k, int sign, level_struct *l, struct Thread *threading ) { + + int i, j, jj, start, end; + compute_core_start_end(0, x->size, &start, &end, l, threading); + int thread = omp_get_thread_num(); + if (thread == 0 && start != end ) + PROF_PRECISION_START( _LA8 ); + + if( sign == 1 ) + for( i=start; inum_vect, jj, z->vector_buffer[i*x->num_vect+j+jj] = x->vector_buffer[i*x->num_vect+j+jj] + alpha[k*x->num_vect+j+jj]*y->vector_buffer[i*x->num_vect+j+jj];) + else + for( i=start; inum_vect, jj, z->vector_buffer[i*x->num_vect+j+jj] = 
x->vector_buffer[i*x->num_vect+j+jj] - alpha[k*x->num_vect+j+jj]*y->vector_buffer[i*x->num_vect+j+jj];) + + if( thread == 0 && start != end ) + PROF_PRECISION_STOP( _LA8, (double)(end-start)/(double)l->inner_vector_size ); +} + +void vector_PRECISION_multi_saxpy( vector_PRECISION *z, vector_PRECISION *V, complex_PRECISION *alpha, int sign, int count, int start, int end, level_struct *l ) { int thread = omp_get_thread_num(); @@ -344,36 +440,58 @@ void vector_PRECISION_multi_saxpy( vector_PRECISION z, vector_PRECISION *V, comp for ( int c=0; cvector_buffer[i] += V[c].vector_buffer[i]*alpha_signed[c]; i++; ) } } if( thread == 0 && start != end ) PROF_PRECISION_STOP( _LA8, (PRECISION)(count) ); } -#endif -void vector_PRECISION_projection( vector_PRECISION z, vector_PRECISION v, int k, vector_PRECISION *W, complex_PRECISION *diag, +void vector_PRECISION_multi_saxpy_new( vector_PRECISION *z, vector_PRECISION *V, complex_PRECISION *alpha, + int sign, int count, level_struct *l, struct Thread *threading ) { + + int c, i, j, jj, start, end; + compute_core_start_end(0, z->size, &start, &end, l, threading); + int thread = omp_get_thread_num(); + if (thread == 0 && start != end ) + PROF_PRECISION_START( _LA8 ); + + complex_PRECISION alpha_signed[count*z->num_vect]; + for ( c=0; cnum_vect, jj, alpha_signed[c*z->num_vect+j+jj] = sign*alpha[c*z->num_vect+j+jj];) + + for ( c=0; cnum_vect, jj, z->vector_buffer[i*z->num_vect+j+jj] += V[c].vector_buffer[i*z->num_vect+j+jj]*alpha_signed[c];) + + if( thread == 0 && start != end ) + PROF_PRECISION_STOP( _LA8, (PRECISION)(count) ); +} + +void vector_PRECISION_projection( vector_PRECISION *z, vector_PRECISION *v, int k, vector_PRECISION *W, complex_PRECISION *diag, int orthogonal, level_struct *l, Thread *threading ) { int j, start, end; compute_core_start_end( 0, l->inner_vector_size, &start, &end, l, threading ); - vector_PRECISION v_tmp = NULL, *W_tmp = NULL; + vector_PRECISION v_tmp, *W_tmp = NULL; complex_PRECISION ip[k], ip_buffer[2*k]; - MALLOC( v_tmp, complex_PRECISION, l->inner_vector_size ); - vector_PRECISION_define(v_tmp, 0, 0, l->inner_vector_size, l ); + vector_PRECISION_init( &v_tmp ); + + vector_PRECISION_alloc( &v_tmp, _INNER, 1, l, no_threading ); + vector_PRECISION_define( &v_tmp, 0, 0, l->inner_vector_size, l ); - MALLOC( W_tmp, complex_PRECISION*, k ); - W_tmp[0] = NULL; - MALLOC( W_tmp[0], complex_PRECISION, k*l->inner_vector_size ); - for ( j = 1; jinner_vector_size; + MALLOC( W_tmp, vector_PRECISION, k ); + for ( j = 0; jinner_vector_size, l ); + vector_PRECISION_scale( &W_tmp[j], W+j, diag[j], 0, l->inner_vector_size, l ); } process_multi_inner_product_PRECISION( k, ip, W_tmp, v, 0, l->inner_vector_size, l, threading ); @@ -385,16 +503,18 @@ void vector_PRECISION_projection( vector_PRECISION z, vector_PRECISION v, int k, END_MASTER(threading) SYNC_MASTER_TO_ALL(threading) - vector_PRECISION_multi_saxpy( v_tmp, W_tmp, ip_buffer+k, 1, k, 0, l->inner_vector_size, l ); + vector_PRECISION_multi_saxpy( &v_tmp, W_tmp, ip_buffer+k, 1, k, 0, l->inner_vector_size, l ); if (orthogonal) - vector_PRECISION_minus( z, v, v_tmp, 0, l->inner_vector_size, l ); + vector_PRECISION_minus( z, v, &v_tmp, 0, l->inner_vector_size, l ); else - vector_PRECISION_copy( z, v_tmp, 0, l->inner_vector_size, l ); + vector_PRECISION_copy( z, &v_tmp, 0, l->inner_vector_size, l ); - FREE( v_tmp, complex_PRECISION, l->inner_vector_size ); - FREE( W_tmp[0], complex_PRECISION, k*l->inner_vector_size ); - FREE( W_tmp, complex_PRECISION*, k ); + vector_PRECISION_free( &v_tmp, l, 
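vector_PRECISION_saxpy_new takes an array of coefficients with one entry per right-hand side, a row index k into that array and a sign flag (1 for plus, anything else for minus); vector_PRECISION_multi_saxpy_new applies count such updates at once. The sketch below spells out the indexing under the interleaved layout. One detail that may be worth double-checking in the hunk above: alpha_signed is filled with count*num_vect entries but the update multiplies by alpha_signed[c]; the per-right-hand-side index used in this sketch is presumably what was intended.

#include <complex.h>

/* Serial sketch of the sign-aware axpy kernels; alpha holds one
 * coefficient per right-hand side, and k selects a row of the
 * (count x num_vect) coefficient table, matching alpha[k*num_vect + j]. */

/* z(:,j) = x(:,j) + sign * alpha[k*num_vect+j] * y(:,j) */
static void saxpy_sketch( double complex *z, const double complex *x,
                          const double complex *y, const double complex *alpha,
                          int k, int sign, int size, int num_vect ) {
  double s = ( sign == 1 ) ? 1.0 : -1.0;
  for ( int i = 0; i < size; i++ )
    for ( int j = 0; j < num_vect; j++ )
      z[i*num_vect+j] = x[i*num_vect+j] + s*alpha[k*num_vect+j]*y[i*num_vect+j];
}

/* z(:,j) += sign * sum_c alpha[c*num_vect+j] * V_c(:,j) */
static void multi_saxpy_sketch( double complex *z, const double complex *const *V,
                                const double complex *alpha, int sign,
                                int count, int size, int num_vect ) {
  double s = ( sign == 1 ) ? 1.0 : -1.0;
  for ( int c = 0; c < count; c++ )
    for ( int i = 0; i < size; i++ )
      for ( int j = 0; j < num_vect; j++ )
        z[i*num_vect+j] += s*alpha[c*num_vect+j]*V[c][i*num_vect+j];
}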
no_threading ); + for ( j = 0; jn_thread*threading->core+threading->thread; jn_thread*threading->n_core ) { for ( k1=0; k1 V[k2] | 2*j-th and 2*j+1-st aggregate for ( i=0; ivector_buffer[i] = alpha*phi->vector_buffer[i]; eta2->vector_buffer[i] = _COMPLEX_PRECISION_ZERO; i++; ) + FOR6( eta2->vector_buffer[i] = alpha*phi->vector_buffer[i]; eta1->vector_buffer[i] = _COMPLEX_PRECISION_ZERO; i++; ) } PROF_PRECISION_STOP( _LA6, 1 ); } -void set_boundary_PRECISION( vector_PRECISION phi, complex_PRECISION alpha, level_struct *l, struct Thread *threading ) { +void set_boundary_PRECISION( vector_PRECISION *phi, complex_PRECISION alpha, level_struct *l, struct Thread *threading ) { PROF_PRECISION_START( _SET, threading ); int i; SYNC_CORES(threading) - THREADED_VECTOR_FOR( i, l->inner_vector_size, l->vector_size, phi[i] = alpha, i++, l, threading ); + THREADED_VECTOR_FOR( i, l->inner_vector_size, l->vector_size, phi->vector_buffer[i] = alpha, i++, l, threading ); SYNC_CORES(threading) PROF_PRECISION_STOP( _SET, (double)(l->vector_size-l->inner_vector_size)/(double)l->inner_vector_size, threading ); @@ -496,7 +616,7 @@ void gram_schmidt_PRECISION( vector_PRECISION *V, complex_PRECISION *buffer, con for ( i=begin; iinner_vector_size, l, threading ); + process_multi_inner_product_PRECISION( i, tmp, V, &V[i], 0, l->inner_vector_size, l, threading ); SYNC_CORES(threading) START_MASTER(threading) for ( j=0; jinner_vector_size, l, threading ); + beta = global_norm_PRECISION( &V[i], 0, l->inner_vector_size, l, threading ); SYNC_MASTER_TO_ALL(threading) - vector_PRECISION_real_scale( V[i], V[i], creal(1.0/beta), start, end, l ); + vector_PRECISION_real_scale( &V[i], &V[i], creal(1.0/beta), start, end, l ); SYNC_CORES(threading) } @@ -534,7 +654,6 @@ void gram_schmidt_PRECISION( vector_PRECISION *V, complex_PRECISION *buffer, con } -#if !defined( SSE ) || !defined( GRAM_SCHMIDT_VECTORIZED_PRECISION ) void setup_gram_schmidt_PRECISION_compute_dots( complex_PRECISION *thread_buffer, vector_PRECISION *V, int count, int offset, int start, int end, level_struct *l, struct Thread *threading) { @@ -543,6 +662,8 @@ void setup_gram_schmidt_PRECISION_compute_dots( int thread_end; int cache_block_size = 12*64; complex_PRECISION tmp[cache_block_size]; + vector_PRECISION tmp_vect; + tmp_vect.vector_buffer = tmp; for(int i=0; i<2*offset; i++) thread_buffer[i] = 0.0; @@ -551,11 +672,11 @@ void setup_gram_schmidt_PRECISION_compute_dots( compute_core_start_end_custom(start, end, &thread_start, &thread_end, l, threading, cache_block_size); for ( int i=thread_start; idepth > 0 ) { - coarse_gamma5_PRECISION( g5v, V[i], thread_start, thread_end, l ); + coarse_gamma5_PRECISION( g5v, &V[i], thread_start, thread_end, l ); for ( j=0; jdepth > 0 ) { for( j=0; jinner_vector_size, threading ); } diff --git a/src/linalg_generic.h b/src/linalg_generic.h index 9bd7a20..9f6f7be 100644 --- a/src/linalg_generic.h +++ b/src/linalg_generic.h @@ -99,24 +99,31 @@ struct Thread; - complex_PRECISION global_inner_product_PRECISION( vector_PRECISION x, vector_PRECISION y, int start, int end, level_struct *l, struct Thread *threading ); - complex_PRECISION process_inner_product_PRECISION( vector_PRECISION phi, vector_PRECISION psi, int start, int end, level_struct *l, struct Thread *threading ); + complex_PRECISION global_inner_product_PRECISION( vector_PRECISION *x, vector_PRECISION *y, int start, int end, level_struct *l, struct Thread *threading ); + complex_PRECISION process_inner_product_PRECISION( vector_PRECISION *phi, vector_PRECISION *psi, int 
start, int end, level_struct *l, struct Thread *threading ); - void process_multi_inner_product_PRECISION( int count, complex_PRECISION *results, vector_PRECISION *phi, vector_PRECISION psi, + void process_multi_inner_product_PRECISION( int count, complex_PRECISION *results, vector_PRECISION *phi, vector_PRECISION *psi, int start, int end, level_struct *l, struct Thread *threading ); + void process_multi_inner_product_PRECISION_new( int count, complex_PRECISION *results, vector_PRECISION *phi, vector_PRECISION *psi, + level_struct *l, struct Thread *threading ); - PRECISION global_norm_PRECISION( vector_PRECISION phi, int start, int end, level_struct *l, struct Thread *threading ); - PRECISION process_norm_PRECISION( vector_PRECISION x, int start, int end, level_struct *l, struct Thread *threading ); - - complex_PRECISION local_xy_over_xx_PRECISION( vector_PRECISION phi, vector_PRECISION psi, int start, int end, level_struct *l ); - void vector_PRECISION_plus( vector_PRECISION z, vector_PRECISION x, vector_PRECISION y, int start, int end, level_struct *l ); // z := x + y - void vector_PRECISION_minus( vector_PRECISION z, vector_PRECISION x, vector_PRECISION y, int start, int end, level_struct *l ); // z := x - y - void vector_PRECISION_scale( vector_PRECISION z, vector_PRECISION x, complex_PRECISION alpha, int start, int end, level_struct *l ); // z := alpha*x - void vector_PRECISION_real_scale( vector_PRECISION z, vector_PRECISION x, complex_PRECISION alpha, + PRECISION global_norm_PRECISION( vector_PRECISION *phi, int start, int end, level_struct *l, struct Thread *threading ); + PRECISION process_norm_PRECISION( vector_PRECISION *x, int start, int end, level_struct *l, struct Thread *threading ); + void global_norm_PRECISION_new( PRECISION *res, vector_PRECISION *x, level_struct *l, struct Thread *threading ); + + complex_PRECISION local_xy_over_xx_PRECISION( vector_PRECISION *phi, vector_PRECISION *psi, int start, int end, level_struct *l ); + void vector_PRECISION_plus( vector_PRECISION *z, vector_PRECISION *x, vector_PRECISION *y, int start, int end, level_struct *l ); // z := x + y + void vector_PRECISION_plus_new( vector_PRECISION *z, vector_PRECISION *x, vector_PRECISION *y, level_struct *l, struct Thread *threading ); + void vector_PRECISION_minus( vector_PRECISION *z, vector_PRECISION *x, vector_PRECISION *y, int start, int end, level_struct *l ); // z := x - y + void vector_PRECISION_minus_new( vector_PRECISION *z, vector_PRECISION *x, vector_PRECISION *y, level_struct *l, struct Thread *threading ); + void vector_PRECISION_scale( vector_PRECISION *z, vector_PRECISION *x, complex_PRECISION alpha, int start, int end, level_struct *l ); // z := alpha*x + void vector_PRECISION_scale_new( vector_PRECISION *z, vector_PRECISION *x, complex_PRECISION *alpha, int k, level_struct *l, struct Thread *threading ); + void buffer_PRECISION_real_scale( complex_PRECISION *z, complex_PRECISION *x, complex_PRECISION alpha, int start, int end, level_struct *l ); - void vector_PRECISION_saxpy( vector_PRECISION z, vector_PRECISION x, vector_PRECISION y, complex_PRECISION alpha, int start, int end, level_struct *l ); // z := x + alpha*y - void vector_PRECISION_copy( vector_PRECISION z, vector_PRECISION x, int start, int end, level_struct *l ); // z := x - void vector_PRECISION_projection( vector_PRECISION z, vector_PRECISION v, int k, vector_PRECISION *W, complex_PRECISION *diag, + void vector_PRECISION_saxpy( vector_PRECISION *z, vector_PRECISION *x, vector_PRECISION *y, complex_PRECISION alpha, 
int start, int end, level_struct *l ); // z := x + alpha*y + void vector_PRECISION_saxpy_new( vector_PRECISION *z, vector_PRECISION *x, vector_PRECISION *y, complex_PRECISION *alpha, int k, int sign, level_struct *l, struct Thread *threading ); + void buffer_PRECISION_copy( complex_PRECISION *z, complex_PRECISION *x, int start, int end, level_struct *l ); // z := x + void vector_PRECISION_projection( vector_PRECISION *z, vector_PRECISION *v, int k, vector_PRECISION *W, complex_PRECISION *diag, int orthogonal, level_struct *l, Thread *threading ); void gram_schmidt_on_aggregates_PRECISION( vector_PRECISION *V, const int num_vec, level_struct *l, struct Thread *threading ); @@ -135,11 +142,11 @@ int num_vec, int leading_dimension, level_struct *l, struct Thread *threading ); void gram_schmidt_PRECISION( vector_PRECISION *V, complex_PRECISION *buffer, const int start, const int n, level_struct *l, struct Thread *threading ); - void setup_gram_schmidt_PRECISION( vector_PRECISION *V, vector_PRECISION g5v, + void setup_gram_schmidt_PRECISION( vector_PRECISION *V, vector_PRECISION *g5v, complex_PRECISION *buffer, const int n, level_struct *l, struct Thread *threading ); - void spinwise_PRECISION_skalarmultiply( vector_PRECISION eta1, vector_PRECISION eta2, - vector_PRECISION phi, complex_PRECISION alpha, int start, int end, level_struct *l ); - void set_boundary_PRECISION( vector_PRECISION phi, complex_PRECISION alpha, level_struct *l, struct Thread *threading ); + void spinwise_PRECISION_skalarmultiply( vector_PRECISION *eta1, vector_PRECISION *eta2, + vector_PRECISION *phi, complex_PRECISION alpha, int start, int end, level_struct *l ); + void set_boundary_PRECISION( vector_PRECISION *phi, complex_PRECISION alpha, level_struct *l, struct Thread *threading ); #endif diff --git a/src/linsolve.c b/src/linsolve.c index bc24c81..df5eca2 100644 --- a/src/linsolve.c +++ b/src/linsolve.c @@ -28,10 +28,10 @@ void fgmres_MP_struct_init( gmres_MP_struct *p ) { } -void fgmres_MP_struct_alloc( int m, int n, long int vl, double tol, const int prec_kind, +void fgmres_MP_struct_alloc( int m, int n, const int vl_type, double tol, const int prec_kind, void (*precond)(), gmres_MP_struct *p, level_struct *l ) { long int total=0; - int i, k=0; + int i, k=0, n_vl=g.num_rhs_vect; p->dp.restart_length = m; p->sp.restart_length = m; p->dp.num_restart = n; p->sp.num_restart = n; @@ -39,7 +39,7 @@ void fgmres_MP_struct_alloc( int m, int n, long int vl, double tol, const int pr if ( g.method == 6 ) { p->dp.eval_operator = g5D_plus_clover_double; p->sp.eval_operator = g5D_plus_clover_float; } else { - p->dp.eval_operator = d_plus_clover_double; p->sp.eval_operator = d_plus_clover_float; + p->dp.eval_operator = d_plus_clover_double_new; p->sp.eval_operator = d_plus_clover_float_new; } p->dp.tol = tol; p->sp.tol = MAX(tol,1E-5); p->dp.kind = _NOTHING; p->sp.kind = prec_kind; @@ -56,19 +56,18 @@ void fgmres_MP_struct_alloc( int m, int n, long int vl, double tol, const int pr if ( g.method == 6 ) { g.p.eval_operator = g5D_plus_clover_double; } else { - g.p.eval_operator = d_plus_clover_double; + g.p.eval_operator = d_plus_clover_double_new; } #ifdef HAVE_TM1p1 - vl*=2; + n_vl*=2; #endif // double precision part total = 0; - total += (m+1)*m; // Hessenberg matrix + total += (m+1)*m*n_vl; // Hessenberg matrix MALLOC( p->dp.H, complex_double*, m ); - total += 4*(m+1); // y, gamma, c, s - total += 3*vl; // x, r, b + total += 4*(m+1)*n_vl; // y, gamma, c, s p->dp.total_storage = total; // precomputed storage amount @@ -79,58 +78,54 
@@ void fgmres_MP_struct_alloc( int m, int n, long int vl, double tol, const int pr total = 0; // H for ( i=1; idp.H[i] = p->dp.H[0] + i*(m+1); - total += m*(m+1); + p->dp.H[i] = p->dp.H[0] + i*(m+1)*n_vl; + total += m*(m+1)*n_vl; // y - p->dp.y = p->dp.H[0] + total; total += m+1; + p->dp.y = p->dp.H[0] + total; total += (m+1)*n_vl; // gamma - p->dp.gamma = p->dp.H[0] + total; total += m+1; + p->dp.gamma = p->dp.H[0] + total; total += (m+1)*n_vl; // c - p->dp.c = p->dp.H[0] + total; total += m+1; + p->dp.c = p->dp.H[0] + total; total += (m+1)*n_vl; // s - p->dp.s = p->dp.H[0] + total; total += m+1; + p->dp.s = p->dp.H[0] + total; total += (m+1)*n_vl; // x - p->dp.x = p->dp.H[0] + total; total += vl; + vector_double_alloc( &(p->dp.x), vl_type, n_vl, l, no_threading ); // r - p->dp.r = p->dp.H[0] + total; total += vl; + vector_double_alloc( &(p->dp.r), vl_type, n_vl, l, no_threading ); // b - p->dp.b = p->dp.H[0] + total; total += vl; + vector_double_alloc( &(p->dp.b), vl_type, n_vl, l, no_threading ); ASSERT( p->dp.total_storage == total ); // single precision part total = 0; - total += (2+m)*vl; // w, V - MALLOC( p->sp.V, complex_float*, m+1 ); + MALLOC( p->sp.V, vector_float, m+1 ); if ( precond != NULL ) { if ( prec_kind == _RIGHT ) { - total += (m+1)*vl; // Z k = m+1; } else { - total += vl; k = 1; } - MALLOC( p->sp.Z, complex_float*, k ); + MALLOC( p->sp.Z, vector_float, k ); } p->sp.total_storage = total; // precomputed storage amount - p->sp.w = NULL; - MALLOC( p->sp.w, complex_float, total ); - // reserve storage total = 0; // w - p->sp.w = p->sp.w + total; total += vl; + vector_float_alloc( &(p->sp.w), vl_type, n_vl, l, no_threading ); // V for ( i=0; isp.V[i] = p->sp.w + total; total += vl; + vector_float_init( &(p->sp.V[i]) ); + vector_float_alloc( &(p->sp.V[i]), vl_type, n_vl, l, no_threading ); } // Z if ( precond != NULL ) { for ( i=0; isp.Z[i] = p->sp.w + total; total += vl; + vector_float_init( &(p->sp.Z[i]) ); + vector_float_alloc( &(p->sp.Z[i]), vl_type, n_vl, l, no_threading ); } } @@ -138,18 +133,20 @@ void fgmres_MP_struct_alloc( int m, int n, long int vl, double tol, const int pr } -void fgmres_MP_struct_free( gmres_MP_struct *p ) { +void fgmres_MP_struct_free( gmres_MP_struct *p, level_struct *l ) { // single precision - FREE( p->sp.w, complex_float, p->sp.total_storage ); - FREE( p->sp.V, complex_float*, p->sp.restart_length+1 ); + vector_float_free( &(p->sp.w), l, no_threading ); + FREE( p->sp.V, vector_float, p->sp.restart_length+1 ); if ( p->sp.Z != NULL ) - FREE( p->sp.Z, complex_float*, p->sp.kind==_RIGHT?p->sp.restart_length+1:1 ); + FREE( p->sp.Z, vector_float, p->sp.kind==_RIGHT?p->sp.restart_length+1:1 ); // double precision FREE( p->dp.H[0], complex_double, p->dp.total_storage ); FREE( p->dp.H, complex_double*, p->dp.restart_length ); - + vector_double_free( &(p->dp.x), l, no_threading ); + vector_double_free( &(p->dp.r), l, no_threading ); + vector_double_free( &(p->dp.b), l, no_threading ); } @@ -166,11 +163,17 @@ int fgmres_MP( gmres_MP_struct *p, level_struct *l, struct Thread *threading ) { int start; int end; - int j=-1, finish=0, iter=0, il, ol; - complex_double gamma0 = 0; - complex_double beta = 0; + int j=-1, finish=0, iter=0, il, ol, n_vect=g.num_rhs_vect, i, jj; + complex_double gamma0[n_vect];//gamma0=0; + double beta[n_vect]; //beta=0; - double norm_r0=1, gamma_jp1=1, t0=0, t1=0; + double t0=0, t1=0; + double norm_r0[n_vect], gamma_jp1[n_vect], gamma0_real[n_vect], gamma_tot, H_tot, gamma_tot2;//norm_r0=1, gamma_jp1=1 + complex_float 
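In fgmres_MP_struct_alloc, every scalar of the usual FGMRES bookkeeping becomes a row of n_vl values: the Hessenberg matrix grows to (m+1)*m*n_vl entries with columns spaced by (m+1)*n_vl, y, gamma, c and s each get (m+1)*n_vl, and x, r, b move to vector_double_alloc. A small indexing sketch, assuming the layout implied by the allocation hunk and by the later H[j][i*n_vect + n + jj] accesses; the helper names are illustrative only.

#include <stddef.h>

/* Layout sketch for the resized FGMRES work arrays with n_vl
 * right-hand sides. */
static size_t H_column_stride( int m, int n_vl ) {
  /* spacing between H[i] and H[i+1], cf. p->dp.H[i] = p->dp.H[0] + i*(m+1)*n_vl */
  return (size_t)( m + 1 ) * n_vl;
}

static size_t H_entry_offset( int i, int rhs, int n_vl ) {
  /* offset of entry i, right-hand side rhs inside one column H[j] */
  return (size_t)i * n_vl + rhs;
}

static size_t small_vector_storage( int m, int n_vl ) {
  /* (m+1)*m entries for H plus the four length-(m+1) vectors
     y, gamma, c, s, each carrying n_vl values per entry */
  return (size_t)( m + 1 ) * m * n_vl + (size_t)4 * ( m + 1 ) * n_vl;
}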
gamma_float[n_vect]; + + VECTOR_LOOP(i, n_vect, jj, norm_r0[i+jj]=1; + gamma_jp1[i+jj]=1;) + START_LOCKED_MASTER(threading) #ifndef WILSON_BENCHMARK if ( l->depth==0 && ( p->dp.timing || p->dp.print ) ) prof_init( l ); @@ -185,97 +188,124 @@ int fgmres_MP( gmres_MP_struct *p, level_struct *l, struct Thread *threading ) { SYNC_MASTER_TO_ALL(threading) // compute start and end indices for core // this puts zero for all other hyperthreads, so we can call functions below with all hyperthreads - compute_core_start_end(p->dp.v_start, p->dp.v_end, &start, &end, l, threading); + //compute_core_start_end(p->dp.v_start, p->dp.v_end, &start, &end, l, threading); // Outer loop in double precision for( ol=0; oldp.num_restart && finish==0; ol++ ) { - + if( ol == 0 && p->dp.initial_guess_zero ) { - vector_double_copy( p->dp.r, p->dp.b, start, end, l ); + //vector_double_copy( &(p->dp.r), &(p->dp.b), start, end, l ); + vector_double_copy_new( &(p->dp.r), &(p->dp.b), l, threading ); } else { - apply_operator_double( p->dp.r, p->dp.x, &(p->dp), l, threading ); // compute r <- D*x - vector_double_minus( p->dp.r, p->dp.b, p->dp.r, start, end, l ); // compute r <- b - r + apply_operator_double( &(p->dp.r), &(p->dp.x), &(p->dp), l, threading ); // compute r <- D*x + //vector_double_minus( &(p->dp.r), &(p->dp.b), &(p->dp.r), start, end, l ); // compute r <- b - r + vector_double_minus_new( &(p->dp.r), &(p->dp.b), &(p->dp.r), l, threading ); } - gamma0 = (complex_double) global_norm_double( p->dp.r, p->dp.v_start, p->dp.v_end, l, threading ); // gamma_0 = norm(r) + //gamma0 = (complex_double) global_norm_double( &(p->dp.r), p->dp.v_start, p->dp.v_end, l, threading ); // gamma_0 = norm(r) + global_norm_double_new( gamma0_real, &(p->dp.r), l, threading ); + VECTOR_LOOP(i, n_vect, jj, gamma0[i+jj]=gamma0_real[i+jj];) + START_MASTER(threading) - p->dp.gamma[0] = gamma0; + //p->dp.gamma[0] = gamma0; + VECTOR_LOOP(i, n_vect, jj, p->dp.gamma[i+jj] = gamma0[i+jj];) + END_MASTER(threading) SYNC_MASTER_TO_ALL(threading) if( ol == 0) { if (l->depth == 0 && !p->dp.initial_guess_zero) { - norm_r0 = global_norm_double( p->dp.b, start, end, l, threading ); - printf0("| initial guess relative residual: %le |\n", creal(gamma0)/norm_r0); + //norm_r0 = global_norm_double( &(p->dp.b), start, end, l, threading ); + global_norm_double_new( norm_r0, &(p->dp.b), l, threading ); + for( i=0; idp.print && g.print > 0 ) { START_MASTER(threading) printf0("+----------------------------------------------------------+\n"); - printf0("| restarting ... 
true residual norm: %6e |\n", creal(gamma0)/norm_r0 ); + for( i=0; isp.V[0], p->dp.r, l->s_float.op.translation_table, l, threading ); - vector_float_real_scale( p->sp.V[0], p->sp.V[0], (float)(1/p->dp.gamma[0]), start, end, l ); // V[0] <- r / gamma_0 - +#endif*/ + trans_float_new( &(p->sp.V[0]), &(p->dp.r), l->s_float.op.translation_table, l, threading ); + //vector_float_real_scale( &(p->sp.V[0]), &(p->sp.V[0]), (float)(1/p->dp.gamma[0]), start, end, l ); // V[0] <- r / gamma_0 + VECTOR_LOOP(i, n_vect, jj, gamma_float[i+jj]= (complex_float) p->dp.gamma[0*n_vect+i+jj];) + vector_float_real_scale_new( &(p->sp.V[0]), &(p->sp.V[0]), gamma_float, 0, 1, l, threading ); // inner loop in single precision for( il=0; ildp.restart_length && finish==0; il++) { j = il; iter++; - arnoldi_step_MP( p->sp.V, p->sp.Z, p->sp.w, p->dp.H, p->dp.y, j, p->sp.preconditioner, &(p->sp), l, threading ); - - if ( cabs( p->dp.H[j][j+1] ) > 1E-15 ) { + arnoldi_step_MP_new( p->sp.V, p->sp.Z, &(p->sp.w), p->dp.H, p->dp.y, j, p->sp.preconditioner, &(p->sp), l, threading ); + H_tot=0; + VECTOR_LOOP(i, n_vect, jj, H_tot += cabs( p->dp.H[j][(j+1)*n_vect+i+jj] );) + //if ( cabs( p->dp.H[j][j+1] ) > 1E-15 ) + if ( H_tot > n_vect*1E-15 ) { qr_update_double( p->dp.H, p->dp.s, p->dp.c, p->dp.gamma, j, l, threading ); - gamma_jp1 = cabs( p->dp.gamma[j+1] ); - + //gamma_jp1 = cabs( p->dp.gamma[j+1] ); + VECTOR_LOOP(i, n_vect, jj, gamma_jp1[i+jj] = cabs( p->dp.gamma[(j+1)*n_vect+i+jj] );) + if ( iter%10 == 0 || p->sp.preconditioner != NULL || l->depth > 0 ) { #if defined(TRACK_RES) && !defined(WILSON_BENCHMARK) START_MASTER(threading) if ( p->sp.print && g.print > 0 ) - printf0("| approx. rel. res. after %-6d iterations: %e |\n", iter, gamma_jp1/norm_r0 ); + for( i=0; idp.tol || gamma_jp1/norm_r0 > 1E+5 ) { // if satisfied ... stop + gamma_tot=0; + VECTOR_LOOP(i, n_vect, jj, gamma_tot += gamma_jp1[i+jj]/norm_r0[i+jj];) + + //if( gamma_jp1/norm_r0 < p->dp.tol || gamma_jp1/norm_r0 > 1E+5 ) // if satisfied ... 
stop + if( gamma_tot < n_vect*p->dp.tol || gamma_tot > n_vect*1E+5 ) { finish = 1; START_MASTER(threading) - if ( gamma_jp1/norm_r0 > 1E+5 ) printf0("Divergence of fgmres_MP, iter = %d, level=%d\n", iter, l->level ); + if ( gamma_tot > n_vect*1E+5 ) printf0("Divergence of fgmres_MP, iter = %d, level=%d\n", iter, l->level ); END_MASTER(threading) } - if( gamma_jp1/creal(gamma0) < p->sp.tol ) + gamma_tot2=0; + VECTOR_LOOP(i, n_vect, jj, gamma_tot2 += gamma_jp1[i+jj]/creal(gamma0[i+jj]);) + //if( gamma_jp1/creal(gamma0) < p->sp.tol ) + if( gamma_tot2 < n_vect*p->sp.tol ){ break; + } } else { finish = 1; } } // end of a single restart - compute_solution_MP( p->sp.w, (p->sp.preconditioner&&p->sp.kind==_RIGHT)?p->sp.Z:p->sp.V, + compute_solution_MP_new( &(p->sp.w), (p->sp.preconditioner&&p->sp.kind==_RIGHT)?p->sp.Z:p->sp.V, p->dp.y, p->dp.gamma, p->dp.H, j, &(p->sp), l, threading ); - - trans_back_float( p->dp.r, p->sp.w, l->s_float.op.translation_table, l, threading ); + + trans_back_float_new( &(p->dp.r), &(p->sp.w), l->s_float.op.translation_table, l, threading ); if ( ol == 0 ) { - vector_double_copy( p->dp.x, p->dp.r, start, end, l ); + //vector_double_copy( &(p->dp.x), &(p->dp.r), start, end, l ); + vector_double_copy_new(&(p->dp.x), &(p->dp.r), l, threading); } else { - vector_double_plus( p->dp.x, p->dp.x, p->dp.r, start, end, l ); + //vector_double_plus( &(p->dp.x), &(p->dp.x), &(p->dp.r), start, end, l ); + vector_double_plus_new( &(p->dp.x), &(p->dp.x), &(p->dp.r), l, threading ); } } // end of fgmres START_LOCKED_MASTER(threading) - if ( l->depth == 0 ) { t1 = MPI_Wtime(); g.total_time = t1-t0; g.iter_count = iter; g.norm_res = gamma_jp1/norm_r0; } + if ( l->depth == 0 ) { t1 = MPI_Wtime(); g.total_time = t1-t0; g.iter_count = iter; g.norm_res = gamma_tot; } END_LOCKED_MASTER(threading) if ( p->dp.print ) { #ifdef FGMRES_RESTEST - apply_operator_double( p->dp.r, p->dp.x, &(p->dp), l, threading ); - vector_double_minus( p->dp.r, p->dp.b, p->dp.r, start, end, l ); - beta = global_norm_double( p->dp.r, p->dp.v_start, p->dp.v_end, l, threading ); + apply_operator_double( &(p->dp.r), &(p->dp.x), &(p->dp), l, threading ); + //vector_double_minus( &(p->dp.r), &(p->dp.b), &(p->dp.r), start, end, l ); + vector_double_minus_new( &(p->dp.r), &(p->dp.b), &(p->dp.r), l, threading ); + //beta = global_norm_double( &(p->dp.r), p->dp.v_start, p->dp.v_end, l, threading ); + global_norm_double_new( beta, &(p->dp.r), l, threading ); #else - beta = gamma_jp1; + VECTOR_LOOP(i, n_vect, jj, beta[i+jj] = creal(gamma_jp1[i+jj]);) #endif START_MASTER(threading) #if defined(TRACK_RES) && !defined(WILSON_BENCHMARK) @@ -284,7 +314,8 @@ int fgmres_MP( gmres_MP_struct *p, level_struct *l, struct Thread *threading ) { printf0("+----------------------------------------------------------+\n"); printf0("| FGMRES MP iterations: %-6d coarse average: %-6.2lf |\n", iter, ((double)g.coarse_iter_count)/((double)iter) ); - printf0("| exact relative residual: ||r||/||b|| = %e |\n", creal(beta)/norm_r0 ); + for( i=0; i 0 ) printf0("| coarse grid time: %-8.4lf seconds (%04.1lf%%) |\n", @@ -321,7 +352,7 @@ int fgmres_MP( gmres_MP_struct *p, level_struct *l, struct Thread *threading ) { } -void arnoldi_step_MP( vector_float *V, vector_float *Z, vector_float w, +void arnoldi_step_MP( vector_float *V, vector_float *Z, vector_float *w, complex_double **H, complex_double* buffer, int j, void (*prec)(), gmres_float_struct *p, level_struct *l, struct Thread *threading ) { @@ -337,19 +368,19 @@ void arnoldi_step_MP( vector_float *V, 
vector_float *Z, vector_float w, if ( prec != NULL ) { if ( p->kind == _LEFT ) { - apply_operator_float( Z[0], V[j], p, l, threading ); - prec( w, NULL, Z[0], _NO_RES, l, threading ); + apply_operator_float( &Z[0], &V[j], p, l, threading ); + prec( w, NULL, &Z[0], _NO_RES, l, threading ); } else { if ( g.mixed_precision == 2 && (g.method >= 1 && g.method <= 2 ) ) { - prec( Z[j], w, V[j], _NO_RES, l, threading ); + prec( &Z[j], w, &V[j], _NO_RES, l, threading ); // obtains w = D * Z[j] from Schwarz } else { - prec( Z[j], NULL, V[j], _NO_RES, l, threading ); - apply_operator_float( w, Z[j], p, l, threading ); // w = D*Z[j] + prec( &Z[j], NULL, &V[j], _NO_RES, l, threading ); + apply_operator_float( w, &Z[j], p, l, threading ); // w = D*Z[j] } } } else { - apply_operator_float( w, V[j], p, l, threading ); // w = D*V[j] + apply_operator_float( w, &V[j], p, l, threading ); // w = D*V[j] } complex_double tmp[j+1]; @@ -383,11 +414,93 @@ void arnoldi_step_MP( vector_float *V, vector_float *Z, vector_float w, // V_j+1 = w / H_j+1,j if ( cabs_double( H[j][j+1] ) > 1e-15 ) - vector_float_real_scale( V[j+1], w, (float)(1/H[j][j+1]), start, end, l ); + vector_float_real_scale( &V[j+1], w, (float)(1/H[j][j+1]), start, end, l ); } -void compute_solution_MP( vector_float x, vector_float *V, complex_double *y, +void arnoldi_step_MP_new( vector_float *V, vector_float *Z, vector_float *w, + complex_double **H, complex_double* buffer, int j, void (*prec)(), + gmres_float_struct *p, level_struct *l, struct Thread *threading ) { + + SYNC_MASTER_TO_ALL(threading) + SYNC_CORES(threading) + int i, n_vect=g.num_rhs_vect, n, jj; + double H_tot; + complex_float H_float[n_vect]; + // start and end indices for vector functions depending on thread + int start; + int end; + // compute start and end indices for core + // this puts zero for all other hyperthreads, so we can call functions below with all hyperthreads + //compute_core_start_end(p->v_start, p->v_end, &start, &end, l, threading); + + if ( prec != NULL ) { + if ( p->kind == _LEFT ) { + apply_operator_float( &Z[0], &V[j], p, l, threading ); + prec( w, NULL, &Z[0], _NO_RES, l, threading ); + } else { + if ( g.mixed_precision == 2 && (g.method >= 1 && g.method <= 2 ) ) { + prec( &Z[j], w, &V[j], _NO_RES, l, threading ); + // obtains w = D * Z[j] from Schwarz + } else { + prec( &Z[j], NULL, &V[j], _NO_RES, l, threading ); + apply_operator_float( w, &Z[j], p, l, threading ); // w = D*Z[j] + } + } + } else { + apply_operator_float( w, &V[j], p, l, threading ); // w = D*V[j] + } + + complex_double tmp[(j+1)*n_vect]; + process_multi_inner_product_MP_new( j+1, tmp, V, w, l, threading ); + START_MASTER(threading) + for( i=0; i<=j; i++ ) + VECTOR_LOOP(n, n_vect, jj, buffer[i*n_vect+n+jj] = tmp[i*n_vect+n+jj];) + + if ( g.num_processes > 1 ) { + PROF_double_START( _ALLR ); + MPI_Allreduce( buffer, H[j], (j+1)*n_vect, MPI_COMPLEX_double, MPI_SUM, (l->depth==0)?g.comm_cart:l->gs_float.level_comm ); + PROF_double_STOP( _ALLR, 1 ); + } else { + for( i=0; i<=j; i++ ) + VECTOR_LOOP(n, n_vect, jj, H[j][i*n_vect+n+jj] = buffer[i*n_vect+n+jj];) + } + END_MASTER(threading) + SYNC_MASTER_TO_ALL(threading) + + complex_float alpha[(j+1)*n_vect]; + for( i=0; i<=j; i++ ) + VECTOR_LOOP(n, n_vect, jj, alpha[i*n_vect+n+jj] = (complex_float) H[j][i*n_vect+n+jj];) + for( i=0; i<=j; i++ ) + vector_float_saxpy_new( w, w, &V[i], alpha, i, -1, l, threading ); + /*// orthogonalization + complex_float alpha[(j+1)*n_vect]; + + for( i=0; i<=j; i++ ) + for( n_vec=0; n_vec n_vect*1e-15 ){ + 
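The _new Arnoldi step above orthogonalizes one composite vector w that carries all right-hand sides at once: process_multi_inner_product_MP_new returns (j+1)*n_vect local inner products, a single MPI_Allreduce fills column j of H, and the projections are then subtracted per right-hand side with vector_float_saxpy_new. A plain-array sketch of that classical Gram-Schmidt step; the stride len and the V[i][n*len+k] layout are assumptions of this sketch, the real layout stays hidden inside the vector helpers:

#include <complex.h>

static void gram_schmidt_block( int j, int n_vect, int len,
                                float complex **V, float complex *w,
                                double complex *Hj ) {
  /* 1) coefficients are all taken against the unmodified w, which is why a
   *    single Allreduce per Arnoldi step suffices in the solver */
  for ( int i=0; i<=j; i++ )
    for ( int n=0; n<n_vect; n++ ) {
      double complex h = 0;
      for ( int k=0; k<len; k++ )
        h += conjf( V[i][n*len+k] ) * w[n*len+k];   /* <V_i, w> for RHS n */
      Hj[i*n_vect+n] = h;
    }
  /* 2) subtract the projections for every right-hand side */
  for ( int i=0; i<=j; i++ )
    for ( int n=0; n<n_vect; n++ )
      for ( int k=0; k<len; k++ )
        w[n*len+k] -= (float complex)Hj[i*n_vect+n] * V[i][n*len+k];
}

The rescaling of V[j+1] afterwards is guarded by the summed |H[j][(j+1)*n_vect+n]| over the block (the H_tot > n_vect*1e-15 test), so a near-breakdown in one right-hand side alone does not suppress the normalization of the others.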
VECTOR_LOOP(n, n_vect, jj, H_float[n+jj]= (complex_float) H[j][(j+1)*n_vect+n+jj];) + vector_float_real_scale_new( &V[j+1], w, H_float, 0, 1, l, threading ); + } +} + + +void compute_solution_MP( vector_float *x, vector_float *V, complex_double *y, complex_double *gamma, complex_double **H, int j, gmres_float_struct *p, level_struct *l, struct Thread *threading ) { @@ -418,12 +531,57 @@ void compute_solution_MP( vector_float x, vector_float *V, complex_double *y, SYNC_MASTER_TO_ALL(threading) // x = V*y - vector_float_scale( x, V[0], (complex_float) y[0], start, end, l ); + vector_float_scale( x, &V[0], (complex_float) y[0], start, end, l ); complex_float alpha[j]; for ( i=1; i<=j; i++ ) alpha[i-1] = (complex_float) y[i]; - vector_float_multi_saxpy( x, &(V[1]), alpha, 1, j, start, end, l ); + vector_float_multi_saxpy( x, &V[1], alpha, 1, j, start, end, l ); } +void compute_solution_MP_new( vector_float *x, vector_float *V, complex_double *y, + complex_double *gamma, complex_double **H, int j, + gmres_float_struct *p, level_struct *l, struct Thread *threading ) { + + int i, k, n, jj, n_vect=g.num_rhs_vect; + complex_float y_float[n_vect]; + // start and end indices for vector functions depending on thread + //int start; + //int end; + // compute start and end indices for core + // this puts zero for all other hyperthreads, so we can call functions below with all hyperthreads + //compute_core_start_end(p->v_start, p->v_end, &start, &end, l, threading); + + START_MASTER(threading) + + PROF_double_START( _SMALL2 ); + + // backward substitution + for ( i=j; i>=0; i-- ) { + VECTOR_LOOP(n, n_vect, jj, y[i*n_vect+n+jj] = gamma[i*n_vect+n+jj];) + for ( k=i+1; k<=j; k++ ) { + for ( n=0; nZ = NULL; p->V = NULL; p->H = NULL; - p->x = NULL; - p->b = NULL; - p->r = NULL; - p->w = NULL; + vector_PRECISION_init(&(p->x)); + vector_PRECISION_init(&(p->b)); + vector_PRECISION_init(&(p->r)); + vector_PRECISION_init(&(p->w)); p->y = NULL; p->gamma = NULL; p->c = NULL; @@ -44,7 +44,7 @@ void fgmres_PRECISION_struct_init( gmres_PRECISION_struct *p ) { } -void fgmres_PRECISION_struct_alloc( int m, int n, long int vl, PRECISION tol, const int type, const int prec_kind, +void fgmres_PRECISION_struct_alloc( int m, int n, const int vl_type, PRECISION tol, const int type, const int prec_kind, void (*precond)(), void (*eval_op)(), gmres_PRECISION_struct *p, level_struct *l ) { /********************************************************************************* @@ -62,7 +62,7 @@ void fgmres_PRECISION_struct_alloc( int m, int n, long int vl, PRECISION tol, co *********************************************************************************/ long int total=0; - int i, k=0; + int i, k=0, n_vl=g.num_rhs_vect; p->restart_length = m; p->num_restart = n; @@ -72,38 +72,34 @@ void fgmres_PRECISION_struct_alloc( int m, int n, long int vl, PRECISION tol, co p->kind = prec_kind; #ifdef HAVE_TM1p1 - vl*=2; + n_vl*=2; #endif if(m > 0) { - total += (m+1)*m; // Hessenberg matrix + total += (m+1)*m*n_vl; // Hessenberg matrix MALLOC( p->H, complex_PRECISION*, m ); - total += (5+m)*vl; // x, r, b, w, V - MALLOC( p->V, complex_PRECISION*, m+1 ); + MALLOC( p->V, vector_PRECISION, m+1 ); if ( precond != NULL ) { if ( prec_kind == _RIGHT ) { - total += (m+1)*vl; // Z k = m+1; } else { - total += vl; k = 1; } - MALLOC( p->Z, complex_PRECISION*, k ); + MALLOC( p->Z, vector_PRECISION, k ); } else { #if defined(SINGLE_ALLREDUCE_ARNOLDI) && defined(PIPELINED_ARNOLDI) if ( l->level == 0 && l->depth > 0 ) { - total += (m+2)*vl; k = m+2; - MALLOC( 
p->Z, complex_PRECISION*, k ); + MALLOC( p->Z, vector_PRECISION, k ); } #else k = 0; #endif } - total += 4*(m+1); // y, gamma, c, s + total += 4*(m+1)*n_vl; // y, gamma, c, s p->H[0] = NULL; // allocate connected memory MALLOC( p->H[0], complex_PRECISION, total ); @@ -114,34 +110,36 @@ void fgmres_PRECISION_struct_alloc( int m, int n, long int vl, PRECISION tol, co // ordering: H, y, gamma, c, s, w, V, Z, x, r, b // H for ( i=1; iH[i] = p->H[0] + i*(m+1); - total += m*(m+1); + p->H[i] = p->H[0] + i*(m+1)*n_vl; + total += m*(m+1)*n_vl; // y - p->y = p->H[0] + total; total += m+1; + p->y = p->H[0] + total; total += (m+1)*n_vl; // gamma - p->gamma = p->H[0] + total; total += m+1; + p->gamma = p->H[0] + total; total += (m+1)*n_vl; // c - p->c = p->H[0] + total; total += m+1; + p->c = p->H[0] + total; total += (m+1)*n_vl; // s - p->s = p->H[0] + total; total += m+1; + p->s = p->H[0] + total; total += (m+1)*n_vl; // w - p->w = p->H[0] + total; total += vl; + vector_PRECISION_alloc( &(p->w), vl_type, n_vl, l, no_threading ); // V for ( i=0; iV[i] = p->H[0] + total; total += vl; + vector_PRECISION_init(&(p->V[i])); + vector_PRECISION_alloc( &(p->V[i]), vl_type, n_vl, l, no_threading ); } // Z for ( i=0; iZ[i] = p->H[0] + total; total += vl; + vector_PRECISION_init(&(p->Z[i])); + vector_PRECISION_alloc( &(p->Z[i]), vl_type, n_vl, l, no_threading ); } // x - p->x = p->H[0] + total; total += vl; + vector_PRECISION_alloc( &(p->x), vl_type, n_vl, l, no_threading ); // r - p->r = p->H[0] + total; total += vl; + vector_PRECISION_alloc( &(p->r), vl_type, n_vl, l, no_threading ); // b - p->b = p->H[0] + total; total += vl; + vector_PRECISION_alloc( &(p->b), vl_type, n_vl, l, no_threading ); ASSERT( p->total_storage == total ); } @@ -205,10 +203,14 @@ void fgmres_PRECISION_struct_free( gmres_PRECISION_struct *p, level_struct *l ) if(p->restart_length > 0) { FREE( p->H[0], complex_PRECISION, p->total_storage ); FREE( p->H, complex_PRECISION*, p->restart_length ); - FREE( p->V, complex_PRECISION*, p->restart_length+1 ); - + FREE( p->V, vector_PRECISION, p->restart_length+1 ); + vector_PRECISION_free( &(p->w), l, no_threading ); + vector_PRECISION_free( &(p->x), l, no_threading ); + vector_PRECISION_free( &(p->r), l, no_threading ); + vector_PRECISION_free( &(p->b), l, no_threading ); + if ( p->Z != NULL ) - FREE( p->Z, complex_PRECISION*, k ); + FREE( p->Z, vector_PRECISION, k ); } p->D = NULL; @@ -228,11 +230,17 @@ int fgmres_PRECISION( gmres_PRECISION_struct *p, level_struct *l, struct Thread int end; int j=-1, finish=0, iter=0, il, ol, res; - complex_PRECISION gamma0 = 0; - - complex_PRECISION beta = 0; + int n_vect=g.num_rhs_vect, i, jj; + complex_PRECISION gamma0[n_vect];//gamma0 = 0; + + PRECISION beta[n_vect];//complex_PRECISION beta = 0; - PRECISION norm_r0=1, gamma_jp1=1, t0=0, t1=0; + double H_tot; + PRECISION norm_r0[n_vect], gamma_jp1[n_vect], gamma_tot, gamma0_real[n_vect], t0=0, t1=0; + + VECTOR_LOOP(i, n_vect, jj, norm_r0[i+jj]=1; + gamma_jp1[i+jj]=1;) + START_LOCKED_MASTER(threading) if ( l->depth==0 && ( p->timing || p->print ) ) prof_init( l ); @@ -247,47 +255,58 @@ int fgmres_PRECISION( gmres_PRECISION_struct *p, level_struct *l, struct Thread SYNC_MASTER_TO_ALL(threading) // compute start and end indices for core // this puts zero for all other hyperthreads, so we can call functions below with all hyperthreads - compute_core_start_end(p->v_start, p->v_end, &start, &end, l, threading); - - for( ol=0; olnum_restart && finish==0; ol++ ) { - + 
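Taking stock of this allocation hunk: the connected allocation is kept only for the small matrices and scalars (H, y, gamma, c, s), whose every logical entry is now replicated n_vl = g.num_rhs_vect times (doubled again under HAVE_TM1p1), while x, r, b, w, V[i] and Z[i] become vector_PRECISION objects with their own vector_PRECISION_alloc / vector_PRECISION_free calls instead of being carved out of the H[0] slab. A small sketch of the resulting indexing convention; hess_entry is a hypothetical helper introduced here only to document the layout, it is not part of the patch:

#include <complex.h>

/* H[col] points at column `col` of the Hessenberg matrix; within a column,
 * the row entries are stored with stride n_vect, one slot per right-hand
 * side (the project's complex_double is plain double complex). */
static inline double complex hess_entry( double complex **H,
                                         int row, int col, int rhs, int n_vect ) {
  return H[col][row*n_vect + rhs];
}

/* the same convention holds for the flat arrays, e.g. gamma[(j+1)*n_vect + rhs] */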
//compute_core_start_end(p->v_start+p->r.size*n_vec, p->v_end+p->r.size*n_vec, &start, &end, l, threading); + + SYNC_CORES(threading) + for( ol=0; olnum_restart && finish==0; ol++ ) { if( ol == 0 && p->initial_guess_zero ) { res = _NO_RES; - vector_PRECISION_copy( p->r, p->b, start, end, l ); + //vector_PRECISION_copy( &(p->r), &(p->b), start, end, l ); + vector_PRECISION_copy_new( &(p->r), &(p->b), l, threading ); } else { res = _RES; if ( p->kind == _LEFT && p->preconditioner ) { - apply_operator_PRECISION( p->Z[0], p->x, p, l, threading ); + apply_operator_PRECISION( &(p->Z[0]), &(p->x), p, l, threading ); if ( g.method == 5 ) { START_LOCKED_MASTER(threading) - g.bicgstab_tol = (!g.mixed_precision)?p->tol:MAX( 1E-3, (p->tol/(gamma_jp1/norm_r0))*5E-1 ); + //g.bicgstab_tol = (!g.mixed_precision)?p->tol:MAX( 1E-3, (p->tol/(gamma_jp1/norm_r0))*5E-1 ); END_LOCKED_MASTER(threading) } - p->preconditioner( p->w, NULL, p->Z[0], _NO_RES, l, threading ); + p->preconditioner( &(p->w), NULL, &(p->Z[0]), _NO_RES, l, threading ); } else { - apply_operator_PRECISION( p->w, p->x, p, l, threading ); // compute w = D*x + apply_operator_PRECISION( &(p->w), &(p->x), p, l, threading ); // compute w = D*x } - vector_PRECISION_minus( p->r, p->b, p->w, start, end, l ); // compute r = b - w + //vector_PRECISION_minus( &(p->r), &(p->b), &(p->w), start, end, l ); // compute r = b - w + vector_PRECISION_minus_new( &(p->r), &(p->b), &(p->w), l, threading ); } - gamma0 = global_norm_PRECISION( p->r, p->v_start, p->v_end, l, threading ); // gamma_0 = norm(r) + //gamma0 = global_norm_PRECISION( &(p->r), p->v_start, p->v_end, l, threading ); // gamma_0 = norm(r) + global_norm_PRECISION_new( gamma0_real, &(p->r), l, threading ); + + VECTOR_LOOP(i, n_vect, jj, gamma0[i+jj]=gamma0_real[i+jj];) + START_MASTER(threading) - p->gamma[0] = gamma0; + //p->gamma[0] = gamma0; + VECTOR_LOOP(i, n_vect, jj, p->gamma[i+jj] = gamma0[i+jj];) + END_MASTER(threading); SYNC_MASTER_TO_ALL(threading); if ( ol == 0 ) { if (l->depth == 0 && !p->initial_guess_zero) { - norm_r0 = global_norm_PRECISION( p->b, p->v_start, p->v_end, l, threading ); - printf0("| initial guess relative residual: %le |\n", creal(gamma0)/norm_r0); + //norm_r0 = global_norm_PRECISION( &(p->b), p->v_start, p->v_end, l, threading ); + global_norm_PRECISION_new( norm_r0, &(p->b), l, threading ); + for( i=0; igamma[0]); + //norm_r0 = creal(p->gamma[0]); + VECTOR_LOOP(i, n_vect, jj, norm_r0[i+jj] = creal(p->gamma[i+jj]);) } } - - vector_PRECISION_real_scale( p->V[0], p->r, 1/p->gamma[0], start, end, l ); // v_0 = r / gamma_0 + //vector_PRECISION_real_scale( &(p->V[0]), &(p->r), 1/p->gamma[0], start, end, l ); // v_0 = r / gamma_0 + vector_PRECISION_real_scale_new( &(p->V[0]), &(p->r), p->gamma, 0, 1, l, threading ); #if defined(SINGLE_ALLREDUCE_ARNOLDI) && defined(PIPELINED_ARNOLDI) if ( l->level == 0 && l->depth > 0 ) { - arnoldi_step_PRECISION( p->V, p->Z, p->w, p->H, p->y, 0, p->preconditioner, p, l, threading ); + arnoldi_step_PRECISION( p->V, p->Z, &(p->w), p->H, p->y, 0, p->preconditioner, p, l, threading ); } #endif @@ -295,79 +314,96 @@ int fgmres_PRECISION( gmres_PRECISION_struct *p, level_struct *l, struct Thread j = il; iter++; if ( g.method == 5 ) { START_LOCKED_MASTER(threading) - g.bicgstab_tol = (!g.mixed_precision)?p->tol:MAX( 1E-3, (p->tol/(gamma_jp1/norm_r0))*5E-1 ); + //g.bicgstab_tol = (!g.mixed_precision)?p->tol:MAX( 1E-3, (p->tol/(gamma_jp1/norm_r0))*5E-1 ); END_LOCKED_MASTER(threading) } // one step of Arnoldi #if defined(SINGLE_ALLREDUCE_ARNOLDI) && 
defined(PIPELINED_ARNOLDI) if ( l->level == 0 && l->depth > 0 ) { - if ( !arnoldi_step_PRECISION( p->V, p->Z, p->w, p->H, p->y, j+1, p->preconditioner, p, l, threading ) ) { + if ( !arnoldi_step_PRECISION( p->V, p->Z, &(p->w), p->H, p->y, j+1, p->preconditioner, p, l, threading ) ) { printf0("| -------------- iteration %d, restart due to H(%d,%d) < 0 |\n", iter, j+2, j+1 ); break; } } else { - if ( !arnoldi_step_PRECISION( p->V, p->Z, p->w, p->H, p->y, j, p->preconditioner, p, l, threading ) ) { + if ( !arnoldi_step_PRECISION( p->V, p->Z, &(p->w), p->H, p->y, j, p->preconditioner, p, l, threading ) ) { printf0("| -------------- iteration %d, restart due to H(%d,%d) < 0 |\n", iter, j+1, j ); break; } } #else - if ( !arnoldi_step_PRECISION( p->V, p->Z, p->w, p->H, p->y, j, p->preconditioner, p, l, threading ) ) { + if ( !arnoldi_step_PRECISION_new( p->V, p->Z, &(p->w), p->H, p->y, j, p->preconditioner, p, l, threading ) ) { printf0("| -------------- iteration %d, restart due to H(%d,%d) < 0 |\n", iter, j+1, j ); break; } #endif - - if ( cabs( p->H[j][j+1] ) > p->tol/10 ) { + H_tot=0; + VECTOR_LOOP(i, n_vect, jj, H_tot += cabs( p->H[j][(j+1)*n_vect+i+jj] );) + + //if ( cabs( p->H[j][j+1] ) > p->tol/10 ) + if ( H_tot > n_vect*p->tol/10 ) { qr_update_PRECISION( p->H, p->s, p->c, p->gamma, j, l, threading ); - gamma_jp1 = cabs( p->gamma[j+1] ); - + //gamma_jp1 = cabs( p->gamma[(j+1)] ); + VECTOR_LOOP(i, n_vect, jj, gamma_jp1[i+jj] = cabs( p->gamma[(j+1)*n_vect+i+jj] );) + #if defined(TRACK_RES) && !defined(WILSON_BENCHMARK) if ( iter%10 == 0 || p->preconditioner != NULL || l->depth > 0 ) { START_MASTER(threading) if ( p->print && g.print > 0 ) - printf0("| approx. rel. res. after %-6d iterations: %e |\n", iter, gamma_jp1/norm_r0 ); + for( i=0; itol || gamma_jp1/norm_r0 > 1E+5 ) { // if satisfied ... stop + gamma_tot=0; + VECTOR_LOOP(i, n_vect, jj, gamma_tot += gamma_jp1[i+jj]/norm_r0[i+jj];) + + //if( gamma_jp1/norm_r0 < p->tol || gamma_jp1/norm_r0 > 1E+5 ) // if satisfied ... 
stop + if( gamma_tot < n_vect*p->tol || gamma_tot > n_vect*1E+5 ) { finish = 1; START_MASTER(threading) - if ( gamma_jp1/norm_r0 > 1E+5 ) printf0("Divergence of fgmres_PRECISION, iter = %d, level=%d\n", iter, l->level ); + if ( gamma_tot > n_vect*1E+5 ) printf0("Divergence of fgmres_PRECISION, iter = %d, level=%d\n", iter, l->level ); END_MASTER(threading) } } else { - printf0("depth: %d, iter: %d, p->H(%d,%d) = %+lf+%lfi\n", l->depth, iter, j+1, j, CSPLIT( p->H[j][j+1] ) ); + for( i=0; iH(%d,%d) = %+lf+%lfi\n", i, l->depth, iter, j+1, j, CSPLIT( p->H[j][(j+1)*n_vect+i] ) ); finish = 1; break; } } // end of a single restart - compute_solution_PRECISION( p->x, (p->preconditioner&&p->kind==_RIGHT)?p->Z:p->V, + /*compute_solution_PRECISION( &(p->x), (p->preconditioner&&p->kind==_RIGHT)?(p->Z):(p->V), + p->y, p->gamma, p->H, j, (res==_NO_RES)?ol:1, p, l, threading );*/ + compute_solution_PRECISION_new( &(p->x), (p->preconditioner&&p->kind==_RIGHT)?(p->Z):(p->V), p->y, p->gamma, p->H, j, (res==_NO_RES)?ol:1, p, l, threading ); } // end of fgmres START_LOCKED_MASTER(threading) - if ( l->depth == 0 ) { t1 = MPI_Wtime(); g.total_time = t1-t0; g.iter_count = iter; g.norm_res = gamma_jp1/norm_r0; } + if ( l->depth == 0 ) { t1 = MPI_Wtime(); g.total_time = t1-t0; g.iter_count = iter; g.norm_res = gamma_tot ; }//= gamma_jp1/norm_r0; } END_LOCKED_MASTER(threading) if ( p->print ) { #ifdef FGMRES_RESTEST - apply_operator_PRECISION( p->w, p->x, p, l, threading ); - vector_PRECISION_minus( p->r, p->b, p->w, start, end, l ); - beta = global_norm_PRECISION( p->r, p->v_start, p->v_end, l, threading ); + apply_operator_PRECISION( &(p->w), &(p->x), p, l, threading ); + //vector_PRECISION_minus( &(p->r), &(p->b), &(p->w), start, end, l ); + vector_PRECISION_minus_new( &(p->r), &(p->b), &(p->w), l, threading ); + //beta = global_norm_PRECISION( &(p->r), p->v_start+p->r.size*n_vec, p->v_end+p->r.size*n_vec, l, threading ); + global_norm_PRECISION_new( beta, &(p->r), l, threading ); #else - beta = gamma_jp1; + VECTOR_LOOP(i, n_vect, jj, beta[i+jj] = creal_PRECISION(gamma_jp1[i+jj]);) #endif START_MASTER(threading) - g.norm_res = creal(beta)/norm_r0; + //g.norm_res = creal(beta)/norm_r0; + g.norm_res = 0; + VECTOR_LOOP(i, n_vect, jj, g.norm_res += beta[i+jj]/norm_r0[i+jj];) #if defined(TRACK_RES) && !defined(WILSON_BENCHMARK) if ( g.print > 0 ) printf0("+----------------------------------------------------------+\n\n"); #endif printf0("+----------------------------------------------------------+\n"); printf0("| FGMRES iterations: %-6d coarse average: %-6.2lf |\n", iter, ((double)g.coarse_iter_count)/((double)iter) ); - printf0("| exact relative residual: ||r||/||b|| = %e |\n", creal(beta)/norm_r0 ); + for( i=0; i 0 ) printf0("| coarse grid time: %-8.4lf seconds (%04.1lf%%) |\n", @@ -383,7 +419,7 @@ int fgmres_PRECISION( gmres_PRECISION_struct *p, level_struct *l, struct Thread if ( l->depth > 0 ) { START_MASTER(threading) char number[3]; sprintf( number, "%2d", 31+l->depth ); printf0("\033[1;%2sm|", number ); - printf0(" - depth: %d, gmres iter: %2d, approx rel res: %le |", l->depth, iter, gamma_jp1/norm_r0 ); + //printf0(" - depth: %d, gmres iter: %2d, approx rel res: %le |", l->depth, iter, gamma_jp1/norm_r0 ); printf0("\033[0m\n"); fflush(0); END_MASTER(threading) } @@ -411,8 +447,7 @@ int fgmres_PRECISION( gmres_PRECISION_struct *p, level_struct *l, struct Thread START_MASTER(threading) if ( g.method != 6 ) prof_print( l ); END_MASTER(threading) - } - + } return iter; } @@ -439,13 +474,13 @@ void 
bicgstab_PRECISION( gmres_PRECISION_struct *ps, level_struct *l, struct Thr maxiter = 1000000; r = ps->r; b = ps->b; x = ps->x; p = ps->w; pp = ps->V[0]; r_tilde = ps->V[1]; v = ps->V[2]; s = ps->V[3]; t = ps->V[4]; - vector_PRECISION_copy( r, b, start, end, l ); - vector_PRECISION_copy( r_tilde, b, start, end, l ); - vector_PRECISION_define( x, 0, start, end, l ); - vector_PRECISION_define( v, 0, start, end, l ); - vector_PRECISION_define( s, 0, start, end, l ); - vector_PRECISION_define( t, 0, start, end, l ); - b_norm = global_norm_PRECISION( b, ps->v_start, ps->v_end, l, threading ); + vector_PRECISION_copy( &r, &b, start, end, l ); + vector_PRECISION_copy( &r_tilde, &b, start, end, l ); + vector_PRECISION_define( &x, 0, start, end, l ); + vector_PRECISION_define( &v, 0, start, end, l ); + vector_PRECISION_define( &s, 0, start, end, l ); + vector_PRECISION_define( &t, 0, start, end, l ); + b_norm = global_norm_PRECISION( &b, ps->v_start, ps->v_end, l, threading ); r_norm = b_norm; #if defined(TRACK_RES) && !defined(WILSON_BENCHMARK) @@ -457,7 +492,7 @@ void bicgstab_PRECISION( gmres_PRECISION_struct *ps, level_struct *l, struct Thr iter++; rho_old = rho; - rho = global_inner_product_PRECISION( r_tilde, r, ps->v_start, ps->v_end, l, threading ); + rho = global_inner_product_PRECISION( &r_tilde, &r, ps->v_start, ps->v_end, l, threading ); if ( rho == 0 ) { START_MASTER(threading) @@ -467,31 +502,31 @@ void bicgstab_PRECISION( gmres_PRECISION_struct *ps, level_struct *l, struct Thr } if ( iter == 1 ) { - vector_PRECISION_copy( p, r, start, end, l ); + vector_PRECISION_copy( &p, &r, start, end, l ); } else { beta = (rho/rho_old)*(alpha/omega); - vector_PRECISION_saxpy( pp, p, v, -omega, start, end, l ); - vector_PRECISION_saxpy( p, r, pp, beta, start, end, l ); + vector_PRECISION_saxpy( &pp, &p, &v, -omega, start, end, l ); + vector_PRECISION_saxpy( &p, &r, &pp , beta, start, end, l ); } - apply_operator_PRECISION( v, p, ps, l, threading ); - alpha = rho / global_inner_product_PRECISION( r_tilde, v, ps->v_start, ps->v_end, l, threading ); - vector_PRECISION_saxpy( s, r, v, -alpha, start, end, l ); - s_norm = global_norm_PRECISION( s, ps->v_start, ps->v_end, l, threading ); + apply_operator_PRECISION( &v, &p, ps, l, threading ); + alpha = rho / global_inner_product_PRECISION( &r_tilde, &v, ps->v_start, ps->v_end, l, threading ); + vector_PRECISION_saxpy( &s, &r, &v, -alpha, start, end, l ); + s_norm = global_norm_PRECISION( &s, ps->v_start, ps->v_end, l, threading ); if ( s_norm/b_norm < tol ) { - vector_PRECISION_saxpy( x, x, p, alpha, start, end, l ); + vector_PRECISION_saxpy( &x, &x, &p, alpha, start, end, l ); break; } - apply_operator_PRECISION( t, s, ps, l, threading ); - omega = global_inner_product_PRECISION( t, s, ps->v_start, ps->v_end, l, threading ) - / global_inner_product_PRECISION( t, t, ps->v_start, ps->v_end, l, threading ); + apply_operator_PRECISION( &t, &s, ps, l, threading ); + omega = global_inner_product_PRECISION( &t, &s, ps->v_start, ps->v_end, l, threading ) + / global_inner_product_PRECISION( &t, &t, ps->v_start, ps->v_end, l, threading ); - vector_PRECISION_saxpy( x, x, p, alpha, start, end, l ); - vector_PRECISION_saxpy( x, x, s, omega, start, end, l ); - vector_PRECISION_saxpy( r, s, t, -omega, start, end, l ); + vector_PRECISION_saxpy( &x, &x, &p, alpha, start, end, l ); + vector_PRECISION_saxpy( &x, &x, &s, omega, start, end, l ); + vector_PRECISION_saxpy( &r, &s, &t, -omega, start, end, l ); - r_norm = global_norm_PRECISION( r, ps->v_start, ps->v_end, l, 
threading ); + r_norm = global_norm_PRECISION( &r, ps->v_start, ps->v_end, l, threading ); #if defined(TRACK_RES) && !defined(WILSON_BENCHMARK) START_MASTER(threading) @@ -537,16 +572,15 @@ void cgn_PRECISION( gmres_PRECISION_struct *ps, level_struct *l, struct Thread * // this puts zero for all other hyperthreads, so we can call functions below with all hyperthreads compute_core_start_end(ps->v_start, ps->v_end, &start, &end, l, threading); - vector_PRECISION_define( x, 0, start, end, l ); - apply_operator_PRECISION( Dp, x, ps, l, threading ); - vector_PRECISION_minus( pp, b, Dp, start, end, l ); - apply_operator_dagger_PRECISION( r_old, pp, ps, l, threading ); + vector_PRECISION_define( &x, 0, start, end, l ); + apply_operator_PRECISION( &Dp, &x, ps, l, threading ); + vector_PRECISION_minus( &pp, &b, &Dp, start, end, l ); + apply_operator_dagger_PRECISION( &r_old, &pp, ps, l, threading ); - vector_PRECISION_copy( p, r_old, start, end, l ); - r0_norm = global_norm_PRECISION( r_old, ps->v_start, ps->v_end, l, threading ); + vector_PRECISION_copy( &p, &r_old, start, end, l ); + r0_norm = global_norm_PRECISION( &r_old, ps->v_start, ps->v_end, l, threading ); // prod_rr_old = global_inner_product_PRECISION( r_old, r_old, ps->v_start, ps->v_end, l, threading ); prod_rr_old = r0_norm*r0_norm; - #if defined(TRACK_RES) && !defined(WILSON_BENCHMARK) if ( ps->print ) { START_MASTER(threading) @@ -557,19 +591,19 @@ void cgn_PRECISION( gmres_PRECISION_struct *ps, level_struct *l, struct Thread * while ( sqrt(prod_rr_old) / r0_norm > tol && iter < maxiter ) { iter++; - apply_operator_PRECISION( pp, p, ps, l, threading ); - apply_operator_dagger_PRECISION( Dp, pp, ps, l, threading ); + apply_operator_PRECISION( &pp, &p, ps, l, threading ); + apply_operator_dagger_PRECISION( &Dp, &pp, ps, l, threading ); - gamma = global_inner_product_PRECISION( p, Dp, ps->v_start, ps->v_end, l, threading ); + gamma = global_inner_product_PRECISION( &p, &Dp, ps->v_start, ps->v_end, l, threading ); alpha = prod_rr_old / gamma; - vector_PRECISION_saxpy( x, x, p, alpha, start, end, l ); - vector_PRECISION_saxpy( r_new, r_old, Dp, -alpha, start, end, l ); + vector_PRECISION_saxpy( &x, &x, &p, alpha, start, end, l ); + vector_PRECISION_saxpy( &r_new, &r_old, &Dp, -alpha, start, end, l ); - gamma = global_inner_product_PRECISION( r_new, r_new, ps->v_start, ps->v_end, l, threading ); + gamma = global_inner_product_PRECISION( &r_new, &r_new, ps->v_start, ps->v_end, l, threading ); beta = gamma / prod_rr_old; - vector_PRECISION_saxpy( p, r_new, p, beta, start, end, l ); - vector_PRECISION_copy( r_old, r_new, start, end, l ); + vector_PRECISION_saxpy( &p, &r_new, &p, beta, start, end, l ); + vector_PRECISION_copy( &r_old, &r_new, start, end, l ); prod_rr_old = gamma; #if defined(TRACK_RES) && !defined(WILSON_BENCHMARK) if ( iter%100 == 0 && ps->print >=1 ) { @@ -580,10 +614,10 @@ void cgn_PRECISION( gmres_PRECISION_struct *ps, level_struct *l, struct Thread * #endif } - r0_norm = global_norm_PRECISION( b, ps->v_start, ps->v_end, l, threading ); - apply_operator_PRECISION( Dp, x, ps, l, threading ); - vector_PRECISION_minus( r_true, b, Dp, start, end, l ); - r_norm = global_norm_PRECISION( r_true, ps->v_start, ps->v_end, l, threading ); + r0_norm = global_norm_PRECISION( &b, ps->v_start, ps->v_end, l, threading ); + apply_operator_PRECISION( &Dp, &x, ps, l, threading ); + vector_PRECISION_minus( &r_true, &b, &Dp, start, end, l ); + r_norm = global_norm_PRECISION( &r_true, ps->v_start, ps->v_end, l, threading ); #if 
defined(TRACK_RES) && !defined(WILSON_BENCHMARK) if ( ps->print ) { @@ -598,22 +632,22 @@ void cgn_PRECISION( gmres_PRECISION_struct *ps, level_struct *l, struct Thread * while ( r_norm / r0_norm > tol && iter < maxiter ) { iter++; - apply_operator_PRECISION( pp, p, ps, l, threading ); - apply_operator_dagger_PRECISION( Dp, pp, ps, l, threading ); + apply_operator_PRECISION( &pp, &p, ps, l, threading ); + apply_operator_dagger_PRECISION( &Dp, &pp, ps, l, threading ); - gamma = global_inner_product_PRECISION( p, Dp, ps->v_start, ps->v_end, l, threading ); + gamma = global_inner_product_PRECISION( &p, &Dp, ps->v_start, ps->v_end, l, threading ); alpha = prod_rr_old / gamma; - vector_PRECISION_saxpy( x, x, p, alpha, start, end, l ); - vector_PRECISION_saxpy( r_new, r_old, Dp, -alpha, start, end, l ); + vector_PRECISION_saxpy( &x, &x, &p, alpha, start, end, l ); + vector_PRECISION_saxpy( &r_new, &r_old, &Dp, -alpha, start, end, l ); // residual update - vector_PRECISION_saxpy( r_true, r_true, pp, -alpha, start, end, l ); - r_norm = global_norm_PRECISION( r_true, ps->v_start, ps->v_end, l, threading ); - gamma = global_inner_product_PRECISION( r_new, r_new, ps->v_start, ps->v_end, l, threading ); + vector_PRECISION_saxpy( &r_true, &r_true, &pp, -alpha, start, end, l ); + r_norm = global_norm_PRECISION( &r_true, ps->v_start, ps->v_end, l, threading ); + gamma = global_inner_product_PRECISION( &r_new, &r_new, ps->v_start, ps->v_end, l, threading ); beta = gamma / prod_rr_old; - vector_PRECISION_saxpy( p, r_new, p, beta, start, end, l ); - vector_PRECISION_copy( r_old, r_new, start, end, l ); + vector_PRECISION_saxpy( &p, &r_new, &p, beta, start, end, l ); + vector_PRECISION_copy( &r_old, &r_new, start, end, l ); prod_rr_old = gamma; #if defined(TRACK_RES) && !defined(WILSON_BENCHMARK) if ( iter%100 == 0 && ps->print >=1 ) { @@ -630,10 +664,10 @@ void cgn_PRECISION( gmres_PRECISION_struct *ps, level_struct *l, struct Thread * printf0("+----------------------------------------------------------+\n"); printf0("| CGN iterations: %-6d |\n", iter ); END_MASTER(threading) - apply_operator_PRECISION( Dp, x, ps, l, threading ); - vector_PRECISION_minus( pp, b, Dp, start, end, l ); + apply_operator_PRECISION( &Dp, &x, ps, l, threading ); + vector_PRECISION_minus( &pp, &b, &Dp, start, end, l ); - beta = global_norm_PRECISION( pp, ps->v_start, ps->v_end, l, threading ); + beta = global_norm_PRECISION( &pp, ps->v_start, ps->v_end, l, threading ); START_MASTER(threading) if ( ps->timing ) printf0("| exact relative residual: ||r||/||b|| = %e |\n", creal(beta/r0_norm) ); printf0("| elapsed wall clock time: %-12g seconds |\n", t1-t0 ); @@ -658,15 +692,15 @@ void cgn_PRECISION( gmres_PRECISION_struct *ps, level_struct *l, struct Thread * } -int arnoldi_step_PRECISION( vector_PRECISION *V, vector_PRECISION *Z, vector_PRECISION w, +int arnoldi_step_PRECISION( vector_PRECISION *V, vector_PRECISION *Z, vector_PRECISION *w, complex_PRECISION **H, complex_PRECISION* buffer, int j, void (*prec)(), gmres_PRECISION_struct *p, level_struct *l, struct Thread *threading ) { /********************************************************************************* * Extends the Arnoldi basis by one vector. -* - vector_PRECISION *V: Contains the Arnoldi basis vectors. -* - vector_PRECISION *Z: If a right precond. P is used, contains P*V[j] for all j. -* - vector_PRECISION w: Will be appended to existing Arnoldi basis at +* - vector_PRECISION **V: Contains the Arnoldi basis vectors. +* - vector_PRECISION **Z: If a right precond. 
P is used, contains P*V[j] for all j. +* - vector_PRECISION *w: Will be appended to existing Arnoldi basis at * position j+1. * - complex_PRECISION **H: Contains full Hessenberg matrix from the Arnoldi * decomposition (columnmajor!) @@ -688,12 +722,12 @@ int arnoldi_step_PRECISION( vector_PRECISION *V, vector_PRECISION *Z, vector_PRE compute_core_start_end(p->v_start, p->v_end, &start, &end, l, threading); if ( j == 0 ) - vector_PRECISION_copy( Z[0], V[0], start, end, l ); + vector_PRECISION_copy( &Z[0], &V[0], start, end, l ); else - vector_PRECISION_copy( V[j], Z[j], start, end, l ); + vector_PRECISION_copy( &V[j], &Z[j], start, end, l ); complex_PRECISION tmp[j+1]; - process_multi_inner_product_PRECISION( j+1, tmp, V, V[j], p->v_start, p->v_end, l, threading ); + process_multi_inner_product_PRECISION( j+1, tmp, V, &V[j], p->v_start, p->v_end, l, threading ); START_MASTER(threading) PROF_PRECISION_START( _ALLR ); for( i=0; i<=j; i++ ) @@ -708,7 +742,7 @@ int arnoldi_step_PRECISION( vector_PRECISION *V, vector_PRECISION *Z, vector_PRE PROF_PRECISION_STOP( _ALLR, 1 ); END_MASTER(threading) - apply_operator_PRECISION( Z[j+1], Z[j], p, l, threading ); + apply_operator_PRECISION( &Z[j+1], &Z[j], p, l, threading ); START_MASTER(threading) PROF_PRECISION_START( _ALLR ); @@ -725,8 +759,8 @@ int arnoldi_step_PRECISION( vector_PRECISION *V, vector_PRECISION *Z, vector_PRE SYNC_MASTER_TO_ALL(threading) for( i=0; i 0 ) { @@ -736,13 +770,13 @@ int arnoldi_step_PRECISION( vector_PRECISION *V, vector_PRECISION *Z, vector_PRE SYNC_MASTER_TO_ALL(threading) if ( j == 0 ) { - if ( sigma ) vector_PRECISION_saxpy( Z[j+1], Z[j+1], Z[j], -sigma, start, end, l ); + if ( sigma ) vector_PRECISION_saxpy( &Z[j+1], &Z[j+1], &Z[j], -sigma, start, end, l ); } else { for( i=0; ikind == _LEFT ) { - apply_operator_PRECISION( Z[0], V[j], p, l, threading ); - prec( V[j+1], NULL, Z[0], _NO_RES, l, threading ); - if ( sigma ) vector_PRECISION_saxpy( V[j+1], V[j+1], V[j], -sigma, start, end, l ); + apply_operator_PRECISION( &Z[0], &V[j], p, l, threading ); + prec( &V[j+1], NULL, &Z[0], _NO_RES, l, threading ); + if ( sigma ) vector_PRECISION_saxpy( &V[j+1], &V[j+1], &V[j], -sigma, start, end, l ); } else { if ( l->level == 0 ) { - prec( Z[j], NULL, V[j], _NO_RES, l, threading ); - apply_operator_PRECISION( V[j+1], Z[j], p, l, threading ); + prec( &Z[j], NULL, &V[j], _NO_RES, l, threading ); + apply_operator_PRECISION( &V[j+1], &Z[j], p, l, threading ); } else { if ( g.mixed_precision == 2 && (g.method >= 1 && g.method <= 2 ) ) { - prec( Z[j], V[j+1], V[j], _NO_RES, l, threading ); + prec( &Z[j], &V[j+1], &V[j], _NO_RES, l, threading ); // obtains w = D * Z[j] from Schwarz } else { - prec( Z[j], NULL, V[j], _NO_RES, l, threading ); - apply_operator_PRECISION( V[j+1], Z[j], p, l, threading ); // w = D*Z[j] + prec( &Z[j], NULL, &V[j], _NO_RES, l, threading ); + apply_operator_PRECISION( &V[j+1], &Z[j], p, l, threading ); // w = D*Z[j] } } - if ( sigma ) vector_PRECISION_saxpy( V[j+1], V[j+1], V[j], -sigma, start, end, l ); + if ( sigma ) vector_PRECISION_saxpy( &V[j+1], &V[j+1], &V[j], -sigma, start, end, l ); } } else { - apply_operator_PRECISION( V[j+1], V[j], p, l, threading ); // w = D*V[j] - if ( sigma ) vector_PRECISION_saxpy( V[j+1], V[j+1], V[j], -sigma, start, end, l ); + apply_operator_PRECISION( &V[j+1], &V[j], p, l, threading ); // w = D*V[j] + if ( sigma ) vector_PRECISION_saxpy( &V[j+1], &V[j+1], &V[j], -sigma, start, end, l ); } complex_PRECISION tmp[j+2]; - process_multi_inner_product_PRECISION( j+2, tmp, V, 
V[j+1], p->v_start, p->v_end, l, threading ); + process_multi_inner_product_PRECISION( j+2, tmp, V, &V[j+1], p->v_start, p->v_end, l, threading ); START_MASTER(threading) for( i=0; i<=j+1; i++ ) buffer[i] = tmp[i]; @@ -804,8 +838,8 @@ int arnoldi_step_PRECISION( vector_PRECISION *V, vector_PRECISION *Z, vector_PRE SYNC_MASTER_TO_ALL(threading) for( i=0; i<=j; i++ ) - vector_PRECISION_saxpy( V[j+1], V[j+1], V[i], -H[j][i], start, end, l ); - vector_PRECISION_real_scale( V[j+1], V[j+1], 1/H[j][j+1], start, end, l ); + vector_PRECISION_saxpy( &V[j+1], &V[j+1], &V[i], -H[j][i], start, end, l ); + vector_PRECISION_real_scale( &V[j+1], &V[j+1], 1/H[j][j+1], start, end, l ); START_LOCKED_MASTER(threading) H[j][j] += sigma; END_LOCKED_MASTER(threading) @@ -824,23 +858,23 @@ int arnoldi_step_PRECISION( vector_PRECISION *V, vector_PRECISION *Z, vector_PRE if ( prec != NULL ) { if ( p->kind == _LEFT ) { - apply_operator_PRECISION( Z[0], V[j], p, l, threading ); - prec( w, NULL, Z[0], _NO_RES, l, threading ); + apply_operator_PRECISION( &Z[0], &V[j], p, l, threading ); + prec( w, NULL, &Z[0], _NO_RES, l, threading ); } else { if ( l->level == 0 ) { - apply_operator_PRECISION( w, Z[j], p, l, threading ); + apply_operator_PRECISION( w, &Z[j], p, l, threading ); } else { if ( g.mixed_precision == 2 && (g.method >= 1 && g.method <= 2 ) ) { - prec( Z[j], w, V[j], _NO_RES, l, threading ); + prec( &Z[j], w, &V[j], _NO_RES, l, threading ); // obtains w = D * Z[j] from Schwarz } else { - prec( Z[j], NULL, V[j], _NO_RES, l, threading ); - apply_operator_PRECISION( w, Z[j], p, l, threading ); // w = D*Z[j] + prec( &Z[j], NULL, &V[j], _NO_RES, l, threading ); + apply_operator_PRECISION( w, &Z[j], p, l, threading ); // w = D*Z[j] } } } } else { - apply_operator_PRECISION( w, V[j], p, l, threading ); // w = D*V[j] + apply_operator_PRECISION( w, &V[j], p, l, threading ); // w = D*V[j] } // orthogonalization @@ -860,7 +894,7 @@ int arnoldi_step_PRECISION( vector_PRECISION *V, vector_PRECISION *Z, vector_PRE END_MASTER(threading) SYNC_MASTER_TO_ALL(threading) for( i=0; i<=j; i++ ) - vector_PRECISION_saxpy( w, w, V[i], -H[j][i], start, end, l ); + vector_PRECISION_saxpy( w, w, &V[i], -H[j][i], start, end, l ); #ifdef REORTH // re-orthogonalization process_multi_inner_product_PRECISION( j+1, tmp, V, w, p->v_start, p->v_end, l, threading ); @@ -879,7 +913,7 @@ int arnoldi_step_PRECISION( vector_PRECISION *V, vector_PRECISION *Z, vector_PRE END_MASTER(threading) SYNC_MASTER_TO_ALL(threading) for( i=0; i<=j; i++ ) - vector_PRECISION_saxpy( w, w, V[i], -tmp[i], start, end, l ); + vector_PRECISION_saxpy( w, w, &V[i], -tmp[i], start, end, l ); #endif // normalization @@ -891,7 +925,256 @@ int arnoldi_step_PRECISION( vector_PRECISION *V, vector_PRECISION *Z, vector_PRE // V_j+1 = w / H_j+1,j if ( cabs_PRECISION( H[j][j+1] ) > 1e-15 ) - vector_PRECISION_real_scale( V[j+1], w, 1/H[j][j+1], start, end, l ); + vector_PRECISION_real_scale( &V[j+1], w, 1/H[j][j+1], start, end, l ); +#endif + return 1; +} + + + +int arnoldi_step_PRECISION_new( vector_PRECISION *V, vector_PRECISION *Z, vector_PRECISION *w, + complex_PRECISION **H, complex_PRECISION* buffer, int j, void (*prec)(), + gmres_PRECISION_struct *p, level_struct *l, struct Thread *threading ) { + +/********************************************************************************* +* Extends the Arnoldi basis by one vector. +* - vector_PRECISION **V: Contains the Arnoldi basis vectors. +* - vector_PRECISION **Z: If a right precond. P is used, contains P*V[j] for all j. 
+* - vector_PRECISION *w: Will be appended to existing Arnoldi basis at +* position j+1. +* - complex_PRECISION **H: Contains full Hessenberg matrix from the Arnoldi +* decomposition (columnmajor!) +* - complex_PRECISION* buffer: Buffer for local inner products. +* - int j: index of the new Arnoldi vector to be orthonormalized +* against all previous ones. +* - void (*prec)(): Function pointer to preconditioner (can be NULL if no +* preconditioning is used). +*********************************************************************************/ +#ifdef SINGLE_ALLREDUCE_ARNOLDI +#ifdef PIPELINED_ARNOLDI + if ( l->level == 0 && l->depth > 0 ) { + SYNC_MASTER_TO_ALL(threading) + SYNC_CORES(threading) + MPI_Request req; + MPI_Status stat; + int start, end, i; + const complex_PRECISION sigma = 0; + compute_core_start_end(p->v_start, p->v_end, &start, &end, l, threading); + + if ( j == 0 ) + vector_PRECISION_copy( &Z[0], &V[0], start, end, l ); + else + vector_PRECISION_copy( &V[j], &Z[j], start, end, l ); + + complex_PRECISION tmp[j+1]; + process_multi_inner_product_PRECISION( j+1, tmp, V, &V[j], p->v_start, p->v_end, l, threading ); + START_MASTER(threading) + PROF_PRECISION_START( _ALLR ); + for( i=0; i<=j; i++ ) + buffer[i] = tmp[i]; + if ( g.num_processes > 1 ) { + MPI_Iallreduce( buffer, H[MAX(0,j-1)], j+1, MPI_COMPLEX_PRECISION, MPI_SUM, + (l->depth==0)?g.comm_cart:l->gs_PRECISION.level_comm, &req ); + } else { + for( i=0; i<=j; i++ ) + H[MAX(0,j-1)][i] = buffer[i]; + } + PROF_PRECISION_STOP( _ALLR, 1 ); + END_MASTER(threading) + + apply_operator_PRECISION( &Z[j+1], &Z[j], p, l, threading ); + + START_MASTER(threading) + PROF_PRECISION_START( _ALLR ); + if ( g.num_processes > 1 ) { + MPI_Wait( &req, &stat ); + } + PROF_PRECISION_STOP( _ALLR, 0 ); + if ( j > 0 ) { + for ( i=0; i 0 ) { + H[j-1][j-1] += sigma; + } + END_MASTER(threading) + SYNC_MASTER_TO_ALL(threading) + + if ( j == 0 ) { + if ( sigma ) vector_PRECISION_saxpy( &Z[j+1], &Z[j+1], &Z[j], -sigma, start, end, l ); + } else { + for( i=0; iv_start, p->v_end, &start, &end, l, threading); + + if ( prec != NULL ) { + if ( p->kind == _LEFT ) { + apply_operator_PRECISION( &Z[0], &V[j], p, l, threading ); + prec( &V[j+1], NULL, &Z[0], _NO_RES, l, threading ); + if ( sigma ) vector_PRECISION_saxpy( &V[j+1], &V[j+1], &V[j], -sigma, start, end, l ); + } else { + if ( l->level == 0 ) { + prec( &Z[j], NULL, &V[j], _NO_RES, l, threading ); + apply_operator_PRECISION( &V[j+1], &Z[j], p, l, threading ); + } else { + if ( g.mixed_precision == 2 && (g.method >= 1 && g.method <= 2 ) ) { + prec( &Z[j], &V[j+1], &V[j], _NO_RES, l, threading ); + // obtains w = D * Z[j] from Schwarz + } else { + prec( &Z[j], NULL, &V[j], _NO_RES, l, threading ); + apply_operator_PRECISION( &V[j+1], &Z[j], p, l, threading ); // w = D*Z[j] + } + } + if ( sigma ) vector_PRECISION_saxpy( &V[j+1], &V[j+1], &V[j], -sigma, start, end, l ); + + } + } else { + apply_operator_PRECISION( &V[j+1], &V[j], p, l, threading ); // w = D*V[j] + if ( sigma ) vector_PRECISION_saxpy( &V[j+1], &V[j+1], &V[j], -sigma, start, end, l ); + } + + complex_PRECISION tmp[j+2]; + process_multi_inner_product_PRECISION( j+2, tmp, V, &V[j+1], p->v_start, p->v_end, l, threading ); + START_MASTER(threading) + for( i=0; i<=j+1; i++ ) + buffer[i] = tmp[i]; + + if ( g.num_processes > 1 ) { + PROF_PRECISION_START( _ALLR ); + MPI_Allreduce( buffer, H[j], j+2, MPI_COMPLEX_PRECISION, MPI_SUM, (l->depth==0)?g.comm_cart:l->gs_PRECISION.level_comm ); + PROF_PRECISION_STOP( _ALLR, 1 ); + } else { + for( i=0; i<=j+1; 
i++ ) + H[j][i] = buffer[i]; + } + for ( i=0; i<=j; i++ ) + H[j][j+1] -= conj( H[j][i] )*H[j][i]; + END_MASTER(threading) + SYNC_MASTER_TO_ALL(threading) + if ( creal( H[j][j+1] ) < 0 ) + return 0; + START_MASTER(threading) + H[j][j+1] = sqrt( creal( H[j][j+1] ) ); + END_MASTER(threading) + SYNC_MASTER_TO_ALL(threading) + + for( i=0; i<=j; i++ ) + vector_PRECISION_saxpy( &V[j+1], &V[j+1], &V[i], -H[j][i], start, end, l ); + vector_PRECISION_real_scale( &V[j+1], &V[j+1], 1/H[j][j+1], start, end, l ); + START_LOCKED_MASTER(threading) + H[j][j] += sigma; + END_LOCKED_MASTER(threading) +#ifdef PIPELINED_ARNOLDI + } +#endif +#else + SYNC_MASTER_TO_ALL(threading) + SYNC_CORES(threading) + int i, n_vect=g.num_rhs_vect, n, jj; + PRECISION H_tot; + // start and end indices for vector functions depending on thread + int start, end; + // compute start and end indices for core + // this puts zero for all other hyperthreads, so we can call functions below with all hyperthreads + //compute_core_start_end(p->v_start, p->v_end, &start, &end, l, threading); + + if ( prec != NULL ) { + if ( p->kind == _LEFT ) { + apply_operator_PRECISION( &Z[0], &V[j], p, l, threading ); + prec( w, NULL, &Z[0], _NO_RES, l, threading ); + } else { + if ( l->level == 0 ) { + apply_operator_PRECISION( w, &Z[j], p, l, threading ); + } else { + if ( g.mixed_precision == 2 && (g.method >= 1 && g.method <= 2 ) ) { + prec( &Z[j], w, &V[j], _NO_RES, l, threading ); + // obtains w = D * Z[j] from Schwarz + } else { + prec( &Z[j], NULL, &V[j], _NO_RES, l, threading ); + apply_operator_PRECISION( w, &Z[j], p, l, threading ); // w = D*Z[j] + } + } + } + } else { + apply_operator_PRECISION( w, &V[j], p, l, threading ); // w = D*V[j] + } + + // orthogonalization + complex_PRECISION tmp[(j+1)*n_vect]; + process_multi_inner_product_PRECISION_new( j+1, tmp, V, w, l, threading ); + START_MASTER(threading) + for( i=0; i<=j; i++ ) + VECTOR_LOOP(n, n_vect, jj, buffer[i*n_vect+n+jj] = tmp[i*n_vect+n+jj];) + + if ( g.num_processes > 1 ) { + PROF_PRECISION_START( _ALLR ); + MPI_Allreduce( buffer, H[j], (j+1)*n_vect, MPI_COMPLEX_PRECISION, MPI_SUM, (l->depth==0)?g.comm_cart:l->gs_PRECISION.level_comm ); + PROF_PRECISION_STOP( _ALLR, 1 ); + } else { + for( i=0; i<=j; i++ ) + VECTOR_LOOP(n, n_vect, jj, H[j][i*n_vect+n+jj] = buffer[i*n_vect+n+jj];) + } + END_MASTER(threading) + SYNC_MASTER_TO_ALL(threading) + for( i=0; i<=j; i++ ) + vector_PRECISION_saxpy_new( w, w, &V[i], H[j], i, -1, l, threading ); + +#ifdef REORTH + // re-orthogonalization + process_multi_inner_product_PRECISION_new( j+1, tmp, V, w, l, threading ); + START_MASTER(threading) + for( i=0; i<=j; i++ ) + VECTOR_LOOP(n, n_vect, jj, buffer[i*n_vect+n+jj] = tmp[i*n_vect+n+jj];) + + if ( g.num_processes > 1 ) { + PROF_PRECISION_START( _ALLR ); + MPI_Allreduce( buffer, tmp, (j+1)*n_vect, MPI_COMPLEX_PRECISION, MPI_SUM, (l->depth==0)?g.comm_cart:l->gs_PRECISION.level_comm ); + PROF_PRECISION_STOP( _ALLR, 1 ); + } + + for( i=0; i<=j; i++ ) + VECTOR_LOOP(n, n_vect, jj, H[j][i*n_vect+n+jj] += tmp[i*n_vect+n+jj];) + + END_MASTER(threading) + SYNC_MASTER_TO_ALL(threading) + for( i=0; i<=j; i++ ) + vector_PRECISION_saxpy_new( w, w, &V[i], tmp, i, -1, l, threading ); +#endif + + // normalization + PRECISION tmp2[n_vect]; + global_norm_PRECISION_new( tmp2, w, l, threading ); + START_MASTER(threading) + + VECTOR_LOOP(n, n_vect, jj, H[j][(j+1)*n_vect+n+jj] = tmp2[n+jj];) + + END_MASTER(threading) + SYNC_MASTER_TO_ALL(threading) + + // V_j+1 = w / H_j+1,j + H_tot=0; + VECTOR_LOOP(n, n_vect, jj, H_tot += 
cabs_PRECISION( p->H[j][(j+1)*n_vect+n+jj] );) + if ( H_tot > n_vect*1e-15 ) + vector_PRECISION_real_scale_new( &V[j+1], w, H[j], j+1, 1, l, threading ); #endif return 1; } @@ -917,23 +1200,29 @@ void qr_update_PRECISION( complex_PRECISION **H, complex_PRECISION *s, PROF_PRECISION_START( _SMALL1 ); - int i; - complex_PRECISION beta; + int i, n, jj, n_vect=g.num_rhs_vect; + complex_PRECISION beta[n_vect]; // update QR factorization // apply previous Givens rotation - for ( i=0; iv_start+x->size*n_vec, p->v_end+x->size*n_vec, &start, &end, l, threading); + + START_MASTER(threading) + + PROF_PRECISION_START( _SMALL2 ); + + // backward substitution + for ( i=j; i>=0; i-- ) { + VECTOR_LOOP(n, n_vect, jj, y[i*n_vect+n+jj] = gamma[i*n_vect+n+jj];) + for ( k=i+1; k<=j; k++ ) { + for ( n=0; nnum_lattice_site_var, n = l->block_iter, end = (g.odd_even&&l->depth==0)?(start+nv*s->num_block_even_sites):(start+s->block_vector_size); - vector_PRECISION Dr = s->local_minres_buffer[0]; - vector_PRECISION r = s->local_minres_buffer[1]; - vector_PRECISION lphi = s->local_minres_buffer[2]; + vector_PRECISION Dr, r, lphi; + Dr.vector_buffer = s->local_minres_buffer[0]; + r.vector_buffer = s->local_minres_buffer[1]; + lphi.vector_buffer = s->local_minres_buffer[2]; complex_PRECISION alpha; void (*block_op)() = (l->depth==0)?(g.odd_even?apply_block_schur_complement_PRECISION:block_d_plus_clover_PRECISION) :coarse_block_operator_PRECISION; - - vector_PRECISION_copy( r, eta, start, end, l ); - vector_PRECISION_define( lphi, 0, start, end, l ); + + vector_PRECISION_copy( &r, eta, start, end, l ); + vector_PRECISION_define( &lphi, 0, start, end, l ); for ( i=0; i/ - alpha = local_xy_over_xx_PRECISION( Dr, r, start, end, l ); + alpha = local_xy_over_xx_PRECISION( &Dr, &r, start, end, l ); // phi += alpha * r - vector_PRECISION_saxpy( lphi, lphi, r, alpha, start, end, l ); + vector_PRECISION_saxpy( &lphi, &lphi, &r, alpha, start, end, l ); // r -= alpha * Dr - vector_PRECISION_saxpy( r, r, Dr, -alpha, start, end, l ); + vector_PRECISION_saxpy( &r, &r, &Dr, -alpha, start, end, l ); } - if ( latest_iter != NULL ) vector_PRECISION_copy( latest_iter, lphi, start, end, l ); - if ( phi != NULL ) vector_PRECISION_plus( phi, phi, lphi, start, end, l ); - vector_PRECISION_copy( eta, r, start, end, l ); - + if ( latest_iter != NULL ) vector_PRECISION_copy( latest_iter, &lphi, start, end, l ); + if ( phi != NULL ) vector_PRECISION_plus( phi, phi, &lphi, start, end, l ); + vector_PRECISION_copy( eta, &r, start, end, l ); + END_UNTHREADED_FUNCTION(threading) } @@ -1051,36 +1388,36 @@ void fgcr_PRECISION( gmres_PRECISION_struct *p, level_struct *l ) { for( ol=0; olnum_restart && finish==0; ol++ ) { if( ol == 0 && p->initial_guess_zero ) { - vector_PRECISION_copy( p->r, p->b, p->v_start, p->v_end, l ); + vector_PRECISION_copy( &(p->r), &(p->b), p->v_start, p->v_end, l ); } else { - apply_operator_PRECISION( p->w, p->x, p, l, no_threading ); // compute w = D*x - vector_PRECISION_minus( p->r, p->b, p->w, p->v_start, p->v_end, l ); // compute r = b - w + apply_operator_PRECISION( &(p->w), &(p->x), p, l, no_threading ); // compute w = D*x + vector_PRECISION_minus( &(p->r), &(p->b), &(p->w), p->v_start, p->v_end, l ); // compute r = b - w } if( ol == 0) { - r0_norm = global_norm_PRECISION( p->r, p->v_start, p->v_end, l, no_threading ); + r0_norm = global_norm_PRECISION( &(p->r), p->v_start, p->v_end, l, no_threading ); } for( il=0; ilrestart_length && finish==0; il++ ) { j = il; iter++; - p->preconditioner( p->V[j], p->r, _NO_RES, l, 
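To make the small-system part of this change concrete: qr_update_PRECISION now keeps one beta per right-hand side and applies each Givens rotation entry-wise across the block, and compute_solution_PRECISION_new (like compute_solution_MP_new above) repeats the usual upper-triangular solve once per right-hand side in the H[col][row*n_vect+rhs] layout. A standalone sketch of that back substitution, with plain loops in place of VECTOR_LOOP and hypothetical argument names:

#include <complex.h>

/* y and gamma are flat arrays of (j+1)*n_vect entries; H[k][i*n_vect+n] is
 * the (row i, column k) Hessenberg entry for right-hand side n. */
static void back_subst_block( int j, int n_vect, double complex **H,
                              const double complex *gamma, double complex *y ) {
  for ( int i=j; i>=0; i-- )
    for ( int n=0; n<n_vect; n++ ) {
      double complex s = gamma[i*n_vect+n];
      for ( int k=i+1; k<=j; k++ )
        s -= H[k][i*n_vect+n] * y[k*n_vect+n];   /* subtract already-solved components */
      y[i*n_vect+n] = s / H[i][i*n_vect+n];      /* divide by the diagonal entry */
    }
}

The coefficients y then multiply the stored V (or Z) vectors exactly as before, only through the *_new scale/saxpy wrappers that iterate over the block internally.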
no_threading ); - apply_operator_PRECISION( p->Z[j], p->V[j], p, l, no_threading ); + p->preconditioner( &(p->V[j]), &(p->r), _NO_RES, l, no_threading ); + apply_operator_PRECISION( &(p->Z[j]), &(p->V[j]), p, l, no_threading ); for( i=0; iZ[i], p->Z[j], p->v_start, p->v_end, l, no_threading ) / p->gamma[i]; - vector_PRECISION_saxpy( p->V[j], p->V[j], p->V[i], -beta, p->v_start, p->v_end, l ); - vector_PRECISION_saxpy( p->Z[j], p->Z[j], p->Z[i], -beta, p->v_start, p->v_end, l ); + beta = global_inner_product_PRECISION( &(p->Z[i]), &(p->Z[j]), p->v_start, p->v_end, l, no_threading ) / p->gamma[i]; + vector_PRECISION_saxpy( &(p->V[j]), &(p->V[j]), &(p->V[i]), -beta, p->v_start, p->v_end, l ); + vector_PRECISION_saxpy( &(p->Z[j]), &(p->Z[j]), &(p->Z[i]), -beta, p->v_start, p->v_end, l ); } - p->gamma[j] = global_inner_product_PRECISION( p->Z[j], p->Z[j], p->v_start, p->v_end, l, no_threading ); - alpha = global_inner_product_PRECISION( p->Z[j], p->r, p->v_start, p->v_end, l, no_threading ) / p->gamma[j]; - vector_PRECISION_saxpy( p->x, p->x, p->V[j], alpha, p->v_start, p->v_end, l ); - vector_PRECISION_saxpy( p->r, p->r, p->Z[j], -alpha, p->v_start, p->v_end, l ); + p->gamma[j] = global_inner_product_PRECISION( &(p->Z[j]), &(p->Z[j]), p->v_start, p->v_end, l, no_threading ); + alpha = global_inner_product_PRECISION( &(p->Z[j]), &(p->r), p->v_start, p->v_end, l, no_threading ) / p->gamma[j]; + vector_PRECISION_saxpy( &(p->x), &(p->x), &(p->V[j]), alpha, p->v_start, p->v_end, l ); + vector_PRECISION_saxpy( &(p->r), &(p->r), &(p->Z[j]), -alpha, p->v_start, p->v_end, l ); - alpha = global_norm_PRECISION( p->r, p->v_start, p->v_end, l, no_threading ) / r0_norm; + alpha = global_norm_PRECISION( &(p->r), p->v_start, p->v_end, l, no_threading ) / r0_norm; if ( creal(alpha) < p->tol ) { finish = 1; break; @@ -1094,9 +1431,9 @@ void fgcr_PRECISION( gmres_PRECISION_struct *p, level_struct *l ) { if ( p->timing || p->print ) t1 = MPI_Wtime(); if ( p->print ) { - apply_operator_PRECISION( p->w, p->x, p, l, no_threading ); - vector_PRECISION_minus( p->r, p->b, p->w, p->v_start, p->v_end, l ); - beta = global_norm_PRECISION( p->r, p->v_start, p->v_end, l, no_threading ); + apply_operator_PRECISION( &(p->w), &(p->x), p, l, no_threading ); + vector_PRECISION_minus( &(p->r), &(p->b), &(p->w), p->v_start, p->v_end, l ); + beta = global_norm_PRECISION( &(p->r), p->v_start, p->v_end, l, no_threading ); #if defined(TRACK_RES) && !defined(WILSON_BENCHMARK) printf0("+----------------------------------------------------------+\n"); printf0("\n"); diff --git a/src/linsolve_generic.h b/src/linsolve_generic.h index 8a1f2e8..c44cbb5 100644 --- a/src/linsolve_generic.h +++ b/src/linsolve_generic.h @@ -25,23 +25,27 @@ struct Thread; void fgmres_PRECISION_struct_init( gmres_PRECISION_struct *p ); - void fgmres_PRECISION_struct_alloc( int m, int n, long int vl, PRECISION tol, const int type, const int prec_kind, - void (*precond)(), void (*eval_op)(), gmres_PRECISION_struct *p, level_struct* l ); + void fgmres_PRECISION_struct_alloc( int m, int n, const int vl_type, PRECISION tol, const int type, const int prec_kind, + void (*precond)(), void (*eval_op)(), gmres_PRECISION_struct *p, level_struct *l ); void fgmres_PRECISION_struct_free( gmres_PRECISION_struct *p, level_struct *l ); int fgmres_PRECISION( gmres_PRECISION_struct *p, level_struct *l, struct Thread *threading ); void fgcr_PRECISION( gmres_PRECISION_struct *p, level_struct *l ); void cgn_PRECISION( gmres_PRECISION_struct *p, level_struct *l, struct Thread *threading 
); void bicgstab_PRECISION( gmres_PRECISION_struct *ps, level_struct *l, struct Thread *threading ); - void local_minres_PRECISION( vector_PRECISION phi, vector_PRECISION eta, vector_PRECISION latest_iter, + void local_minres_PRECISION( vector_PRECISION *phi, vector_PRECISION *eta, vector_PRECISION *latest_iter, int start, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ); - int arnoldi_step_PRECISION( vector_PRECISION *V, vector_PRECISION *Z, vector_PRECISION w, + int arnoldi_step_PRECISION( vector_PRECISION *V, vector_PRECISION *Z, vector_PRECISION *w, + complex_PRECISION **H, complex_PRECISION* buffer, int j, void (*prec)(), + gmres_PRECISION_struct *p, level_struct *l, struct Thread *threading ); + int arnoldi_step_PRECISION_new( vector_PRECISION *V, vector_PRECISION *Z, vector_PRECISION *w, complex_PRECISION **H, complex_PRECISION* buffer, int j, void (*prec)(), gmres_PRECISION_struct *p, level_struct *l, struct Thread *threading ); void qr_update_PRECISION( complex_PRECISION **H, complex_PRECISION *s, complex_PRECISION *c, complex_PRECISION *gamma, int j, level_struct *l, struct Thread *threading ); - void compute_solution_PRECISION( vector_PRECISION x, vector_PRECISION *V, complex_PRECISION *y, complex_PRECISION *gamma, + void compute_solution_PRECISION( vector_PRECISION *x, vector_PRECISION *V, complex_PRECISION *y, complex_PRECISION *gamma, + complex_PRECISION **H, int j, int ol, gmres_PRECISION_struct *p, level_struct *l, struct Thread *threading ); + void compute_solution_PRECISION_new( vector_PRECISION *x, vector_PRECISION *V, complex_PRECISION *y, complex_PRECISION *gamma, complex_PRECISION **H, int j, int ol, gmres_PRECISION_struct *p, level_struct *l, struct Thread *threading ); - #endif diff --git a/src/main.c b/src/main.c index ef2c3cb..7e67545 100644 --- a/src/main.c +++ b/src/main.c @@ -89,7 +89,7 @@ int main( int argc, char **argv ) { solve_driver( &l, &threading ); } - + printf0("Number of rhs vectors = %d\n", g.num_rhs_vect); finalize_common_thread_data(commonthreaddata); finalize_no_threading(no_threading); free(commonthreaddata); diff --git a/src/main.h b/src/main.h index cf15fde..ebc9e51 100644 --- a/src/main.h +++ b/src/main.h @@ -32,6 +32,10 @@ #ifndef MAIN_HEADER #define MAIN_HEADER + #define num_loop 4 + + #define VECTOR_LOOP(j, jmax, jj, instructions) for( j=0; j 0 ) { variable = (kind*) memalign( 64, sizeof(kind) * (length) ); } \ - if ( variable == NULL && (length) > 0 ) { \ - error0("malloc of \"%s\" failed: no memory allocated (%s:%d), current memory used: %lf GB.\n", \ - #variable, __FILE__, __LINE__, g.cur_storage/1024.0 ); } \ - g.cur_storage += (sizeof(kind) * (length))/(1024.0*1024.0); \ - if ( g.cur_storage > g.max_storage ) g.max_storage = g.cur_storage; }while(0) -#else #define MALLOC( variable, kind, length ) do{ if ( variable != NULL ) { \ printf0("malloc of \"%s\" failed: pointer is not NULL (%s:%d).\n", #variable, __FILE__, __LINE__ ); } \ if ( (length) > 0 ) { variable = (kind*) malloc( sizeof(kind) * (length) ); } \ @@ -102,7 +96,6 @@ #variable, __FILE__, __LINE__, g.cur_storage/1024.0 ); } \ g.cur_storage += (sizeof(kind) * (length))/(1024.0*1024.0); \ if ( g.cur_storage > g.max_storage ) g.max_storage = g.cur_storage; }while(0) -#endif #define FREE( variable, kind, length ) do{ if ( variable != NULL ) { \ free( variable ); variable = NULL; g.cur_storage -= (sizeof(kind) * (length))/(1024.0*1024.0); } else { \ @@ -180,6 +173,9 @@ #else #define DEBUGOUTPUT( A, FORMAT ) #endif + + #define INDEX_NV_LV_SV( NV, NUM_NV, LV, 
NUM_LV, SV, NUM_SV ) SV+NUM_SV*LV+NUM_SV*NUM_LV*NV + #define INDEX_LV_SV_NV( NV, NUM_NV, LV, NUM_LV, SV, NUM_SV ) NV+NUM_NV*SV+NUM_NV*NUM_SV*LV #include "vectorization_control.h" #include "threading.h" @@ -189,7 +185,7 @@ enum { _NO_DEFAULT_SET, _DEFAULT_SET }; enum { _NO_REORDERING, _REORDER }; enum { _ADD, _COPY }; - enum { _ORDINARY, _SCHWARZ, _ODDEVEN }; + enum { _ORDINARY, _SCHWARZ, _ODDEVEN, _INNER }; enum { _RES, _NO_RES }; enum { _STANDARD, _LIME }; //formats enum { _READ, _WRITE }; @@ -201,10 +197,11 @@ enum { _LEFT, _RIGHT, _NOTHING }; enum { _PERIODIC, _ANTIPERIODIC, _TWISTED, _DIRICHLET }; enum { _GIP, _PIP, _LA2, _LA6, _LA8, _LA, _CPY, _SET, _PR, _SC, _NC, _SM, _OP_COMM, _OP_IDLE, _ALLR, _GD_COMM, _GD_IDLE, _GRAM_SCHMIDT, _GRAM_SCHMIDT_ON_AGGREGATES, - _SM1, _SM2, _SM3, _SM4, _SMALL1, _SMALL2, _NUM_PROF }; // _NUM_PROF has always to be the last constant! + _SM1, _SM2, _SM3, _SM4, _SMALL1, _SMALL2, _RS, _NUM_PROF }; // _NUM_PROF has always to be the last constant! enum { _VTS = 20 }; enum { _TRCKD_VAL, _STP_TIME, _SLV_ITER, _SLV_TIME, _CRS_ITER, _CRS_TIME, _SLV_ERR, _CGNR_ERR, _NUM_OPTB }; - + enum { _NV_LV_SV, _LV_SV_NV }; //vector layout + typedef struct block_struct { int start, color, no_comm, *bt; } block_struct; @@ -392,6 +389,10 @@ // bc: 0 dirichlet, 1 periodic, 2 anti-periodic int bc; + // number of rhs vectors (b) to be solved at the same time (hopefully) + int num_rhs_vect; + + complex_double **gamma; var_table vt; @@ -481,24 +482,8 @@ // functions #include "clifford.h" -#ifdef SSE -#include "vectorization_dirac_float.h" -#include "vectorization_dirac_double.h" -#include "blas_vectorized.h" -#include "sse_blas_vectorized.h" -#include "sse_complex_float_intrinsic.h" -#include "sse_complex_double_intrinsic.h" -#include "sse_coarse_operator_float.h" -#include "sse_coarse_operator_double.h" -#include "sse_linalg_float.h" -#include "sse_linalg_double.h" -#include "sse_interpolation_float.h" -#include "sse_interpolation_double.h" -#else -//no intrinsics #include "interpolation_float.h" #include "interpolation_double.h" -#endif #include "data_float.h" #include "data_double.h" @@ -543,6 +528,8 @@ #include "var_table.h" #include "main_post_def_float.h" #include "main_post_def_double.h" +#include "vector_float.h" +#include "vector_double.h" #ifdef HAVE_LIME #include #include diff --git a/src/main_post_def_generic.h b/src/main_post_def_generic.h index 690ef6b..4817c43 100644 --- a/src/main_post_def_generic.h +++ b/src/main_post_def_generic.h @@ -26,35 +26,35 @@ #include "dirac_PRECISION.h" #include "coarse_operator_PRECISION.h" - static inline void apply_operator_PRECISION( vector_PRECISION output, vector_PRECISION input, gmres_PRECISION_struct *p, level_struct *l, struct Thread *threading ) { + static inline void apply_operator_PRECISION( vector_PRECISION *output, vector_PRECISION *input, gmres_PRECISION_struct *p, level_struct *l, struct Thread *threading ) { p->eval_operator( output, input, p->op, l, threading ); } - static inline void apply_operator_dagger_PRECISION( vector_PRECISION output, vector_PRECISION input, gmres_PRECISION_struct *p, level_struct *l, struct Thread *threading ) { + static inline void apply_operator_dagger_PRECISION( vector_PRECISION *output, vector_PRECISION *input, gmres_PRECISION_struct *p, level_struct *l, struct Thread *threading ) { #ifdef HAVE_TM1p1 if( g.n_flavours == 2 ) { - tau1_gamma5_PRECISION( l->vbuf_PRECISION[6], input, l, threading ); + tau1_gamma5_PRECISION( &(l->vbuf_PRECISION[6]), input, l, threading ); } else #endif { - 
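Aside on the new multi-rhs bookkeeping added to main.h above: num_rhs_vect holds the number of right-hand sides to be solved at the same time, the enum pair _NV_LV_SV / _LV_SV_NV names the two possible buffer orderings, and the INDEX_NV_LV_SV / INDEX_LV_SV_NV macros give the flat offset of a (rhs vector, lattice site, site variable) triple in either ordering. A self-contained illustration of the same arithmetic (the sizes below are invented for the example, not taken from the code):

#include <stdio.h>

/* Same index arithmetic as the INDEX_* macros added to main.h:
 *   _NV_LV_SV: site variable fastest, then lattice site, then rhs vector
 *   _LV_SV_NV: rhs vector fastest, then site variable, then lattice site */
#define INDEX_NV_LV_SV(NV, NUM_NV, LV, NUM_LV, SV, NUM_SV) \
  ((SV) + (NUM_SV)*(LV) + (NUM_SV)*(NUM_LV)*(NV))
#define INDEX_LV_SV_NV(NV, NUM_NV, LV, NUM_LV, SV, NUM_SV) \
  ((NV) + (NUM_NV)*(SV) + (NUM_NV)*(NUM_SV)*(LV))

int main(void) {
  int num_nv = 4, num_lv = 8, num_sv = 12;   /* example sizes only */
  int nv = 1, lv = 3, sv = 5;                /* one (vector, site, component) triple */
  printf("_NV_LV_SV offset: %d\n", INDEX_NV_LV_SV(nv, num_nv, lv, num_lv, sv, num_sv));
  printf("_LV_SV_NV offset: %d\n", INDEX_LV_SV_NV(nv, num_nv, lv, num_lv, sv, num_sv));
  return 0;
}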
gamma5_PRECISION( l->vbuf_PRECISION[6], input, l, threading ); + gamma5_PRECISION( &(l->vbuf_PRECISION[6]), input, l, threading ); #ifdef HAVE_TM //TODO: change_mu_sign_PRECISION( p->op, l, threading ); #endif } - apply_operator_PRECISION( l->vbuf_PRECISION[7], l->vbuf_PRECISION[6], p, l, threading ); + apply_operator_PRECISION( &(l->vbuf_PRECISION[7]), &(l->vbuf_PRECISION[6]), p, l, threading ); #ifdef HAVE_TM1p1 if( g.n_flavours == 2 ) { - tau1_gamma5_PRECISION( output, l->vbuf_PRECISION[7], l, threading ); + tau1_gamma5_PRECISION( output,&(l->vbuf_PRECISION[7]), l, threading ); } else #endif { - gamma5_PRECISION( output, l->vbuf_PRECISION[7], l, threading ); + gamma5_PRECISION( output, &(l->vbuf_PRECISION[7]), l, threading ); #ifdef HAVE_TM //TODO: change_mu_sign_PRECISION( p->op, l, threading ); #endif diff --git a/src/main_pre_def_generic.h b/src/main_pre_def_generic.h index d485518..521e5e8 100644 --- a/src/main_pre_def_generic.h +++ b/src/main_pre_def_generic.h @@ -21,17 +21,26 @@ #ifndef MAIN_PRE_DEF_PRECISION_HEADER #define MAIN_PRE_DEF_PRECISION_HEADER - + typedef PRECISION complex complex_PRECISION; typedef PRECISION complex *config_PRECISION; - typedef PRECISION complex *vector_PRECISION; + typedef PRECISION complex *buffer_PRECISION; + + typedef struct { + buffer_PRECISION vector_buffer; + int num_vect; + int layout; + int type; + int size; + struct level_struct *l; + } vector_PRECISION; typedef struct { int length[8], *boundary_table[8], max_length[4], comm_start[8], in_use[8], offset, comm, num_even_boundary_sites[8], num_odd_boundary_sites[8], num_boundary_sites[8]; - vector_PRECISION buffer[8]; + buffer_PRECISION buffer[8]; MPI_Request sreqs[8], rreqs[8]; } comm_PRECISION_struct; @@ -52,12 +61,9 @@ *index_table, *neighbor_table, *translation_table, table_dim[4], *backward_neighbor_table, table_mod_dim[4], *config_boundary_table[4]; - vector_PRECISION *buffer, prnT, prnZ, prnY, prnX, prpT, prpZ, prpY, prpX; + vector_PRECISION *buffer; + buffer_PRECISION prnT, prnZ, prnY, prnX, prpT, prpZ, prpY, prpX; comm_PRECISION_struct c; - OPERATOR_TYPE_PRECISION *D_vectorized; - OPERATOR_TYPE_PRECISION *D_transformed_vectorized; - OPERATOR_TYPE_PRECISION *clover_vectorized; - OPERATOR_TYPE_PRECISION *clover_oo_inv_vectorized; #ifdef HAVE_TM double mu, mu_odd_shift, mu_even_shift; config_PRECISION tm_term; @@ -65,8 +71,6 @@ #ifdef HAVE_TM1p1 double epsbar, epsbar_ig5_odd_shift, epsbar_ig5_even_shift; config_PRECISION epsbar_term, clover_doublet_oo_inv; - OPERATOR_TYPE_PRECISION *clover_doublet_vectorized; - OPERATOR_TYPE_PRECISION *clover_doublet_oo_inv_vectorized; #endif } operator_PRECISION_struct; @@ -87,7 +91,7 @@ operator_PRECISION_struct op; vector_PRECISION buf1, buf2, buf3, buf4, buf5; vector_PRECISION oe_buf[4]; - vector_PRECISION local_minres_buffer[3]; + buffer_PRECISION local_minres_buffer[3]; int block_oe_offset, *index[4], dir_length[4], num_blocks, num_colors, dir_length_even[4], dir_length_odd[4], *oe_index[4], num_block_even_sites, num_block_odd_sites, num_aggregates, diff --git a/src/oddeven_generic.c b/src/oddeven_generic.c index 9da8cce..32d98f4 100644 --- a/src/oddeven_generic.c +++ b/src/oddeven_generic.c @@ -191,250 +191,253 @@ void selfcoupling_LU_doublet_decomposition_PRECISION( const config_PRECISION out #endif -static inline void LLH_perform_fwd_bwd_subs_PRECISION( vector_PRECISION x, vector_PRECISION b, config_PRECISION L ) { +static inline void LLH_perform_fwd_bwd_subs_PRECISION( vector_PRECISION *x, vector_PRECISION *b, config_PRECISION L, + int start, int 
end ) { /********************************************************************************* * Solves L*(L^H)*x = b for x, i.e., the clover coupling for a single lattice * site. -* - vector_PRECISION b: Right hand side. -* - vector_PRECISION x: Solution. +* - vector_PRECISION *b: Right hand side. +* - vector_PRECISION *x: Solution. * - config_PRECISION L: Cholesky factor ( lower triangular matrix ) *********************************************************************************/ - register int i, j; + register int id, i, j; int n; + buffer_PRECISION x_pt = x->vector_buffer, b_pt = b->vector_buffer; + x_pt += start; b_pt += start; - for ( n=0; n<2; n++ ) { - // forward substitution with L - for ( i=0; i<6; i++ ) { - x[i] = b[i]; - for ( j=0; j=0; i-- ) { - for ( j=i+1; j<6; j++ ) { - x[i] = x[i] - conj_PRECISION(L[(j*(j+1))/2 + i]) * x[j]; + L -= 21; + // backward substitution with L^H + for ( i=5; i>=0; i-- ) { + for ( j=i+1; j<6; j++ ) { + x_pt[i] = x_pt[i] - conj_PRECISION(L[(j*(j+1))/2 + i]) * x_pt[j]; + } + x_pt[i] = x_pt[i] / conj_PRECISION(L[(i*(i+1))/2 + i]); } - x[i] = x[i] / conj_PRECISION(L[(i*(i+1))/2 + i]); + x_pt+=6; + b_pt+=6; + L+=21; } - x+=6; - b+=6; - L+=21; + x_pt+=12; b_pt+=12; L+=42; } } -static inline void LU_perform_fwd_bwd_subs_PRECISION( vector_PRECISION x, vector_PRECISION b, config_PRECISION LU ) { + +static inline void LU_perform_fwd_bwd_subs_PRECISION( vector_PRECISION *x, vector_PRECISION *b, config_PRECISION LU, + int start, int end ) { /********************************************************************************* * Solves L*U*x = b for x, i.e., the clover coupling for a single lattice * site. -* - vector_PRECISION b: Right hand side. -* - vector_PRECISION x: Solution. +* - vector_PRECISION *b: Right hand side. +* - vector_PRECISION *x: Solution. * - config_PRECISION L: Lower matrix from modified LU decomposition * Note: U is given by u_{ii}=1, u_{ij}=l_{ji}* / l_{ii} *********************************************************************************/ - register int i, j, n; + register int id, i, j, n; + buffer_PRECISION x_pt = x->vector_buffer, b_pt = b->vector_buffer; + x_pt += start; b_pt += start; #ifdef HAVE_TM1p1 - if( g.n_flavours == 2) - for ( n=0; n<2; n++ ) { - // solve x = U^(-1) L^(-1) b - // forward substitution with L - for ( i=0; i<12; i++ ) { - x[i] = b[i]; - for ( j=0; j=0; i-- ) { - for ( j=i+1; j<12; j++ ) { - x[i] = x[i] - LU[i*12+j]*x[j]; + if( g.n_flavours == 2) { + LU += (start/24)*288; + for ( id=start; id=0; i-- ) { + for ( j=i+1; j<12; j++ ) { + x_pt[i] = x_pt[i] - LU[i*12+j]*x_pt[j]; + } + x_pt[i] = x_pt[i]/LU[i*(12+1)]; } + x_pt+=12; + b_pt+=12; + LU+=12*12; } - // backward substitution with U - for ( i=6-1; i>=0; i-- ) { - for ( j=i+1; j<6; j++ ) { - x[i] = x[i] - LU[i*6+j]*x[j]; + x_pt+=24; b_pt+=24; LU+=288; + } + } else +#endif + { + LU += (start/12)*72; + for ( id=start; id=0; i-- ) { + for ( j=i+1; j<6; j++ ) { + x_pt[i] = x_pt[i] - LU[i*6+j]*x_pt[j]; + } + x_pt[i] = x_pt[i]/LU[i*(6+1)]; + } + x_pt+=6; + b_pt+=6; + LU+=6*6; } - x[i] = x[i]/LU[i*(6+1)]; } - x+=6; - b+=6; - LU+=6*6; + x_pt+=12; b_pt+=12; LU+=72; } } - -static inline void LLH_multiply_PRECISION( vector_PRECISION y, vector_PRECISION x, config_PRECISION L ) { +static inline void LLH_multiply_PRECISION( vector_PRECISION *y, vector_PRECISION *x, config_PRECISION L, + int start, int end ) { /********************************************************************************* * Applies the clover coupling term to a vector, by multiplying L^H * and then L. 
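For orientation, the two clover-block kernels converted in this hunk act on one 6x6 block at a time, with the Cholesky factor L stored as a packed lower triangle of 21 complex entries (entry (i,j), j<=i, at offset i*(i+1)/2+j, matching the indexing visible above). LLH_perform_fwd_bwd_subs_PRECISION solves L*L^H*x = b by a forward and a backward substitution, and LLH_multiply_PRECISION applies y = L*(L^H*x). A minimal standalone sketch of that per-block algebra with plain complex doubles (an illustration only, not the library routines, which additionally walk the buffer over the start..end range):

#include <complex.h>

/* Solve L*(L^H)*x = b for one 6x6 clover block; L is a packed lower
 * triangle of 21 complex entries, entry (i,j) at offset i*(i+1)/2 + j. */
static void llh_solve_block(double complex x[6], const double complex b[6],
                            const double complex L[21]) {
  int i, j;
  for (i = 0; i < 6; i++) {                  /* forward substitution with L    */
    x[i] = b[i];
    for (j = 0; j < i; j++)
      x[i] -= L[i*(i+1)/2 + j] * x[j];
    x[i] /= L[i*(i+1)/2 + i];
  }
  for (i = 5; i >= 0; i--) {                 /* backward substitution with L^H */
    for (j = i+1; j < 6; j++)
      x[i] -= conj(L[j*(j+1)/2 + i]) * x[j];
    x[i] /= conj(L[i*(i+1)/2 + i]);
  }
}

/* Apply the clover block, y = L * (L^H * x), same packed storage. */
static void llh_apply_block(double complex y[6], const double complex x[6],
                            const double complex L[21]) {
  double complex z[6];
  int i, j;
  for (i = 0; i < 6; i++) {                  /* z = L^H x */
    z[i] = 0;
    for (j = i; j < 6; j++)
      z[i] += conj(L[j*(j+1)/2 + i]) * x[j];
  }
  for (i = 0; i < 6; i++) {                  /* y = L z   */
    y[i] = 0;
    for (j = 0; j <= i; j++)
      y[i] += L[i*(i+1)/2 + j] * z[j];
  }
}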
-* - vector_PRECISION x: Input vector. -* - vector_PRECISION y: Output vector. +* - vector_PRECISION *x: Input vector. +* - vector_PRECISION *y: Output vector. * - config_PRECISION L: Cholesky factor ( lower triangular matrix ) *********************************************************************************/ - register int i, j; + register int id, i, j; int n; complex_PRECISION z[6]; + buffer_PRECISION x_pt = x->vector_buffer, y_pt = y->vector_buffer; + x_pt += start; y_pt += start; - for ( n=0; n<2; n++ ) { - // z = L^H x - for ( j=0; j<6; j++ ) { // columns - for ( i=0; ivector_buffer, y_pt = y->vector_buffer; + x_pt += start; y_pt += start; #ifdef HAVE_TM1p1 - if( g.n_flavours == 2) - for ( n=0; n<2; n++ ) { - for ( i=0; i<12; i++ ) { - y[i] = LU[i*(12+1)]*x[i]; - for ( j=i+1; j<12; j++ ) - y[i] += LU[i*12+j]*x[j]; + if( g.n_flavours == 2) { + LU += (start/24)*288; + for ( id=start; id0; i-- ) + for ( j=0; j0; i-- ) - for ( j=0; j0; i-- ) + for ( j=0; j0; i-- ) - for ( j=0; jclover_doublet_vectorized + (start/24)*288; - PRECISION *x_pt = (PRECISION*)x; - PRECISION *y_pt = (PRECISION*)y; - for ( int i=start; iepsbar_term+(start/24)*12; - if ( g.n_flavours == 2 && - ( op->epsbar != 0 || op->epsbar_ig5_odd_shift != 0 || op->epsbar_ig5_odd_shift != 0 ) ) - apply_doublet_coupling_PRECISION( x, y, epsbar_term, end-start ); -#else - config_PRECISION sc = op->clover_doublet_oo_inv + (start/24)*288; - // diagonal blocks applied to the even sites - for ( int i=start; iclover_doublet_oo_inv, start, end); } else { #endif - x += start; y += start; if ( g.csw ) { -#ifdef OPTIMIZED_SELF_COUPLING_PRECISION - PRECISION *sc_pt = op->clover_vectorized + (start/12)*144; - PRECISION *x_pt = (PRECISION*)x; - PRECISION *y_pt = (PRECISION*)y; - for ( int i=start; iclover + (start/12)*72; - // diagonal blocks applied to the even sites - for ( int i=start; iclover, start, end); #else - config_PRECISION sc = op->clover + (start/12)*42; - // diagonal blocks applied to the even sites - for ( int i=start; iclover, start, end ); #endif } else { config_PRECISION sc = op->clover + start; - for ( int i=start; ivector_buffer[i] = x->vector_buffer[i]*sc[i]; i++;) } } #ifdef HAVE_TM1p1 @@ -443,38 +446,29 @@ void diag_ee_PRECISION( vector_PRECISION y, vector_PRECISION x, operator_PRECISI } // for debugging only -void diag_ee_inv_PRECISION( vector_PRECISION y, vector_PRECISION x, operator_PRECISION_struct *op, +void diag_ee_inv_PRECISION( vector_PRECISION *y, vector_PRECISION *x, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { START_UNTHREADED_FUNCTION(threading) #ifdef HAVE_TM1p1 if( g.n_flavours == 2) { - int i, n1 = op->num_even_sites; - config_PRECISION sc = op->clover_doublet_oo_inv; - // diagonal blocks applied to the even sites - for ( i=0; inum_even_sites; + LU_perform_fwd_bwd_subs_PRECISION( y, x, op->clover_doublet_oo_inv, 0, n1*24); } else { #endif int i, n1 = op->num_even_sites; - config_PRECISION sc = op->clover; if ( g.csw ) { // diagonal blocks applied to the even sites - for ( i=0; iclover, 0, n1*12 ); #else - LU_perform_fwd_bwd_subs_PRECISION( y, x, sc ); - y+=12; x+=12; sc+=72; + LU_perform_fwd_bwd_subs_PRECISION( y, x, op->clover, 0, n1*12 ); #endif - } } else { - for ( i=0; iclover; + for ( i=0; ivector_buffer[i] = x->vector_buffer[i]/sc[i]; i++;) } } #ifdef HAVE_TM1p1 @@ -484,51 +478,35 @@ void diag_ee_inv_PRECISION( vector_PRECISION y, vector_PRECISION x, operator_PRE } // for debugging only -void diag_oo_PRECISION( vector_PRECISION y, vector_PRECISION x, 
operator_PRECISION_struct *op, +void diag_oo_PRECISION( vector_PRECISION *y, vector_PRECISION *x, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { /********************************************************************************* * Applies the odd-odd block of the odd even decomposition to a vector. -* - vector_PRECISION x: Input vector. -* - vector_PRECISION y: Output vector. +* - vector_PRECISION *x: Input vector. +* - vector_PRECISION *y: Output vector. *********************************************************************************/ START_UNTHREADED_FUNCTION(threading) #ifdef HAVE_TM1p1 if( g.n_flavours == 2) { - int i, n1 = op->num_even_sites, n2 = op->num_odd_sites; - config_PRECISION sc = op->clover_doublet_oo_inv + n1*288; - x += n1*24; y += n1*24; - // diagonal blocks applied to the even sites - for ( i=0; inum_even_sites, n2 = op->num_odd_sites; + LU_multiply_PRECISION( y, x, op->clover_doublet_oo_inv, n1*24, (n1+n2)*24 ); } else { #endif int i, n1 = op->num_even_sites, n2 = op->num_odd_sites; - config_PRECISION sc = op->clover; - x += n1*12; y += n1*12; // diagonal blocks applied to the odd sites if ( g.csw ) { #ifndef HAVE_TM - sc += n1*42; - for ( i=0; iclover, n1*12, (n1+n2)*12 ); #else - sc += n1*72; - for ( i=0; iclover, n1*12, (n1+n2)*12 ); #endif } else { - sc += n1*12; - for ( i=0; iclover + n1*12; + for ( i=n1*12; i<(n1+n2)*12; ) { + FOR12( y->vector_buffer[i] = x->vector_buffer[i]*sc[i]; i++;) } } #ifdef HAVE_TM1p1 @@ -539,59 +517,26 @@ void diag_oo_PRECISION( vector_PRECISION y, vector_PRECISION x, operator_PRECISI } -void diag_oo_inv_PRECISION( vector_PRECISION y, vector_PRECISION x, operator_PRECISION_struct *op, +void diag_oo_inv_PRECISION( vector_PRECISION *y, vector_PRECISION *x, operator_PRECISION_struct *op, level_struct *l, int start, int end ) { #ifdef HAVE_TM1p1 if( g.n_flavours == 2) { - x += start; y += start; // inverse diagonal blocks applied to the odd sites -#ifdef OPTIMIZED_SELF_COUPLING_PRECISION - PRECISION *sc_pt = op->clover_doublet_oo_inv_vectorized + (start/24)*2*288; - PRECISION *x_pt = (PRECISION*)x; - PRECISION *y_pt = (PRECISION*)y; - for ( int i=start; iclover_doublet_oo_inv + (start/24)*288; - for ( int i=start; iclover_doublet_oo_inv, start, end ); } else { #endif - config_PRECISION sc = op->clover; - x += start; y += start; // inverse diagonal blocks applied to the odd sites if ( g.csw ) { -#ifdef OPTIMIZED_SELF_COUPLING_PRECISION - PRECISION *sc_pt = op->clover_vectorized + 2*2*(3*start); - PRECISION *x_pt = (PRECISION*)x; - PRECISION *y_pt = (PRECISION*)y; - for ( int i=start; iclover, start, end ); #else - sc += (start/12)*42; - for ( int i=start; iclover, start, end ); #endif } else { - sc += start; - for ( int i=start; iclover + start; + for ( int i=start; ivector_buffer[i] = x->vector_buffer[i]/sc[i]; i++;) } } #ifdef HAVE_TM1p1 @@ -655,31 +600,12 @@ void oddeven_setup_PRECISION( operator_double_struct *in, level_struct *l ) { MALLOC( op->clover, complex_PRECISION, lu_dec_size*n ); Aee = op->clover; Aoo = op->clover + op->num_even_sites*lu_dec_size; - /* TODO: fix the vectorized part -#ifdef OPTIMIZED_SELF_COUPLING_PRECISION - MALLOC_HUGEPAGES( op->clover_vectorized, PRECISION, l->num_inner_lattice_sites*2*2*36, 4*SIMD_LENGTH_PRECISION ); - PRECISION *Aee_vectorized = op->clover_vectorized; - PRECISION *Aoo_vectorized = op->clover_vectorized + op->num_even_sites*2*2*36; -#endif - */ for ( t=0; tclover_doublet_oo_inv, complex_PRECISION, lu_doublet_dec_size*n ); Aee = op->clover_doublet_oo_inv; Aoo = 
op->clover_doublet_oo_inv + op->num_even_sites*lu_doublet_dec_size; - /* -#ifdef OPTIMIZED_SELF_COUPLING_PRECISION - MALLOC_HUGEPAGES( op->clover_doublet_vectorized, PRECISION, l->num_inner_lattice_sites*2*4*36, 4*SIMD_LENGTH_PRECISION ); - MALLOC_HUGEPAGES( op->clover_doublet_oo_inv_vectorized, PRECISION, op->num_odd_sites*2*2*144, 4*SIMD_LENGTH_PRECISION ); - PRECISION *Aee_vectorized = op->clover_doublet_vectorized; - PRECISION *Aoo_vectorized = op->clover_doublet_vectorized + op->num_even_sites*288; - PRECISION *Aoo_inverse_vectorized = op->clover_doublet_oo_inv_vectorized; -#endif - */ for ( t=0; tD_vectorized, PRECISION, 2*4*l->inner_vector_size, 4*SIMD_LENGTH_PRECISION ); - MALLOC_HUGEPAGES( op->D_transformed_vectorized, PRECISION, 2*4*l->inner_vector_size, 4*SIMD_LENGTH_PRECISION ); - for ( int i=0; inum_inner_lattice_sites; i++ ) { - PRECISION *D_vectorized = op->D_vectorized + 96*i; - PRECISION *D_transformed_vectorized = op->D_transformed_vectorized + 96*i; - complex_PRECISION *D_out_pt = op->D + 36*i; - for ( int mu=0; mu<4; mu++ ) { - set_PRECISION_D_vectorized( D_vectorized+24*mu, D_transformed_vectorized+24*mu, D_out_pt+9*mu ); - } - } -#endif - // define data layout MALLOC( op->index_table, int, N[T]*N[Z]*N[Y]*N[X] ); eot = op->index_table; @@ -922,15 +792,15 @@ void oddeven_setup_PRECISION( operator_double_struct *in, level_struct *l ) { MALLOC( op->prnT, complex_PRECISION, j*8 ); op->prnZ = op->prnT + j; op->prnY = op->prnZ + j; op->prnX = op->prnY + j; op->prpT = op->prnX + j; op->prpZ = op->prpT + j; op->prpY = op->prpZ + j; op->prpX = op->prpY + j; - MALLOC( op->buffer, complex_PRECISION*, 2 ); - op->buffer[0] = NULL; + MALLOC( op->buffer, vector_PRECISION, 2 ); + for(int i=0; i<2; i++ ){ + vector_PRECISION_init( &(op->buffer[i]) ); #ifdef HAVE_TM1p1 - MALLOC( op->buffer[0], complex_PRECISION, 4*l->vector_size ); - op->buffer[1] = op->buffer[0] + 2*l->vector_size; + vector_PRECISION_alloc( &(op->buffer[i]), _ORDINARY, 2, l, no_threading ); #else - MALLOC( op->buffer[0], complex_PRECISION, 2*l->vector_size ); - op->buffer[1] = op->buffer[0] + l->vector_size; + vector_PRECISION_alloc( &(op->buffer[i]), _ORDINARY, 1, l, no_threading ); #endif + } ghost_alloc_PRECISION( 0, &(op->c), l ); ghost_sendrecv_init_PRECISION( _COARSE_GLOBAL, &(op->c), l ) ; l->sp_PRECISION.v_end = op->num_even_sites*l->num_lattice_site_var; @@ -945,18 +815,6 @@ void oddeven_free_PRECISION( level_struct *l ) { lu_dec_size = 72; #endif -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_float - FREE_HUGEPAGES( l->oe_op_PRECISION.D_vectorized, PRECISION, 2*4*l->inner_vector_size ); - FREE_HUGEPAGES( l->oe_op_PRECISION.D_transformed_vectorized, PRECISION, 2*4*l->inner_vector_size ); -#endif -#ifdef OPTIMIZED_SELF_COUPLING_PRECISION - FREE_HUGEPAGES( l->oe_op_PRECISION.clover_vectorized, PRECISION, l->num_inner_lattice_sites*2*2*36 ); -#ifdef HAVE_TM1p1 - FREE_HUGEPAGES( l->oe_op_PRECISION.clover_doublet_vectorized, PRECISION, l->num_inner_lattice_sites*2*4*36 ); - FREE_HUGEPAGES( l->oe_op_PRECISION.clover_doublet_oo_inv_vectorized, PRECISION, l->num_inner_lattice_sites*2*2*144 ); -#endif -#endif - ghost_free_PRECISION( &(l->oe_op_PRECISION.c), l ); FREE( l->oe_op_PRECISION.D, complex_PRECISION, 4*nc_size*n ); if ( g.csw ) @@ -977,13 +835,15 @@ void oddeven_free_PRECISION( level_struct *l ) { FREE( l->oe_op_PRECISION.c.boundary_table[2*mu], int, bs ); l->oe_op_PRECISION.c.boundary_table[2*mu+1] = NULL; } - + + for(int i=0; i<2; i++ ){ #ifdef HAVE_TM1p1 - FREE( l->oe_op_PRECISION.buffer[0], complex_PRECISION, 
4*l->vector_size ); + vector_PRECISION_free( &(l->oe_op_PRECISION.buffer[i]), l, no_threading ); #else - FREE( l->oe_op_PRECISION.buffer[0], complex_PRECISION, 2*l->vector_size ); + vector_PRECISION_free( &(l->oe_op_PRECISION.buffer[i]), l, no_threading ); #endif - FREE( l->oe_op_PRECISION.buffer, complex_PRECISION*, 2 ); + } + FREE( l->oe_op_PRECISION.buffer, vector_PRECISION, 2 ); #ifdef HAVE_TM1p1 FREE( l->oe_op_PRECISION.prnT, complex_PRECISION, 2*(l->num_lattice_site_var/2)*l->num_lattice_sites*8 ); FREE( l->oe_op_PRECISION.clover_doublet_oo_inv, complex_PRECISION, 288*n ); @@ -993,7 +853,7 @@ void oddeven_free_PRECISION( level_struct *l ) { } -void oddeven_to_serial_PRECISION( vector_double out, vector_PRECISION in, level_struct *l, struct Thread *threading ) { +void oddeven_to_serial_PRECISION( vector_double *out, vector_PRECISION *in, level_struct *l, struct Thread *threading ) { /********************************************************************************* * Translates a vector from an odd even PRECISION precision layout to a serial @@ -1011,7 +871,7 @@ void oddeven_to_serial_PRECISION( vector_double out, vector_PRECISION in, level_ for ( i=start; ivector_buffer[i*nsv+j] = (complex_double) in->vector_buffer[k*nsv+j]; } } END_NO_HYPERTHREADS(threading) @@ -1019,7 +879,7 @@ void oddeven_to_serial_PRECISION( vector_double out, vector_PRECISION in, level_ } -void serial_to_oddeven_PRECISION( vector_PRECISION out, vector_double in, level_struct *l, struct Thread *threading ) { +void serial_to_oddeven_PRECISION( vector_PRECISION *out, vector_double *in, level_struct *l, struct Thread *threading ) { /********************************************************************************* * Translates a vector from a serial double precision layout to an odd even @@ -1037,7 +897,7 @@ void serial_to_oddeven_PRECISION( vector_PRECISION out, vector_double in, level_ for ( i=start; ivector_buffer[k*nsv+j] = (complex_PRECISION) in->vector_buffer[i*nsv+j]; } } END_NO_HYPERTHREADS(threading) @@ -1045,7 +905,7 @@ void serial_to_oddeven_PRECISION( vector_PRECISION out, vector_double in, level_ } -void oddeven_to_block_PRECISION( vector_PRECISION out, vector_PRECISION in, level_struct *l, struct Thread *threading ) { +void oddeven_to_block_PRECISION( vector_PRECISION *out, vector_PRECISION *in, level_struct *l, struct Thread *threading ) { int i, j, k, m, nsv = l->num_lattice_site_var, *tt_oe = l->oe_op_PRECISION.translation_table, @@ -1059,7 +919,7 @@ void oddeven_to_block_PRECISION( vector_PRECISION out, vector_PRECISION in, leve for ( i=start; ivector_buffer[m*nsv+j] = in->vector_buffer[k*nsv+j]; } } END_NO_HYPERTHREADS(threading) @@ -1067,7 +927,7 @@ void oddeven_to_block_PRECISION( vector_PRECISION out, vector_PRECISION in, leve } -void block_to_oddeven_PRECISION( vector_PRECISION out, vector_PRECISION in, level_struct *l, struct Thread *threading ) { +void block_to_oddeven_PRECISION( vector_PRECISION *out, vector_PRECISION *in, level_struct *l, struct Thread *threading ) { int i, j, k, m, nsv = l->num_lattice_site_var, *tt_oe = l->oe_op_PRECISION.translation_table, @@ -1081,14 +941,14 @@ void block_to_oddeven_PRECISION( vector_PRECISION out, vector_PRECISION in, leve for ( i=start; ivector_buffer[k*nsv+j] = in->vector_buffer[m*nsv+j]; } } END_NO_HYPERTHREADS(threading) SYNC_CORES(threading) } -void hopping_term_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operator_PRECISION_struct *op, +void hopping_term_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, operator_PRECISION_struct 
*op, const int amount, level_struct *l, struct Thread *threading ) { int start_even, end_even, start_odd, end_odd, n = l->num_inner_lattice_sites, @@ -1115,29 +975,20 @@ void hopping_term_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operato plus_dir_param = _ODD_SITES; } -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - complex_PRECISION *prn[4] = { op->prnT, op->prnZ, op->prnY, op->prnX }; - complex_PRECISION *prp[4] = { op->prpT, op->prpZ, op->prpY, op->prpX }; -#else int i, *nb_pt; - vector_PRECISION phi_pt, eta_pt, end_pt; + buffer_PRECISION phi_pt, eta_pt, end_pt; config_PRECISION D_pt; -#endif #ifdef HAVE_TM1p1 if( g.n_flavours == 2 ) { // project in negative directions -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - dprp_PRECISION( prn, phi, 24*start, 24*n ); -#else complex_PRECISION pbuf[12]; - for ( i=12*start, phi_pt=phi+24*start; i<12*n; i+=12, phi_pt+=24 ) { + for ( i=12*start, phi_pt=phi->vector_buffer+24*start; i<12*n; i+=12, phi_pt+=24 ) { dprp_T_PRECISION( op->prnT+i, phi_pt ); dprp_Z_PRECISION( op->prnZ+i, phi_pt ); dprp_Y_PRECISION( op->prnY+i, phi_pt ); dprp_X_PRECISION( op->prnX+i, phi_pt ); } -#endif // start communication in negative direction START_LOCKED_MASTER(threading) ghost_sendrecv_PRECISION( op->prnT, T, -1, &(op->c), minus_dir_param, l ); @@ -1146,10 +997,7 @@ void hopping_term_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operato ghost_sendrecv_PRECISION( op->prnX, X, -1, &(op->c), minus_dir_param, l ); END_LOCKED_MASTER(threading) // project plus dir and multiply with U dagger -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - dprn_su3_PRECISION( prp, phi, op, neighbor, 24*start, 24*n ); -#else - for ( phi_pt=phi+24*start, end_pt=phi+24*n, D_pt = op->D+36*start, nb_pt=neighbor+4*start; phi_ptvector_buffer+24*start, end_pt=phi->vector_buffer+24*n, D_pt = op->D+36*start, nb_pt=neighbor+4*start; phi_ptprpX+i+6, D_pt, pbuf+6 ); mvmh_PRECISION( op->prpX+i+9, D_pt, pbuf+9 ); D_pt += 9; } -#endif if ( amount == _EVEN_SITES ) { start = start_even, n = end_even; } else if ( amount == _ODD_SITES ) { @@ -1198,10 +1045,7 @@ void hopping_term_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operato ghost_wait_PRECISION( op->prnX, X, -1, &(op->c), minus_dir_param, l ); END_LOCKED_MASTER(threading) // multiply with U and lift up minus dir -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - su3_dpbp_PRECISION( eta, prn, op, neighbor, 24*start, 24*n ); -#else - for ( eta_pt=eta+24*start, end_pt=eta+24*n, D_pt = op->D+36*start, nb_pt=neighbor+4*start; eta_ptvector_buffer+24*start, end_pt=eta->vector_buffer+24*n, D_pt = op->D+36*start, nb_pt=neighbor+4*start; eta_ptprnT+i ); @@ -1231,7 +1075,6 @@ void hopping_term_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operato mvm_PRECISION( pbuf+9, D_pt, op->prnX+i+9 ); dpbp_su3_X_PRECISION( pbuf, eta_pt ); D_pt += 9; } -#endif // wait for communication in positive direction START_LOCKED_MASTER(threading) ghost_wait_PRECISION( op->prpT, T, +1, &(op->c), plus_dir_param, l ); @@ -1240,30 +1083,22 @@ void hopping_term_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operato ghost_wait_PRECISION( op->prpX, X, +1, &(op->c), plus_dir_param, l ); END_LOCKED_MASTER(threading) // lift up plus dir -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - dpbn_PRECISION( eta, prp, 24*start, 24*n ); -#else - for ( i=12*start, eta_pt=eta+24*start; i<12*n; i+=12, eta_pt+=24 ) { + for ( i=12*start, eta_pt=eta->vector_buffer+24*start; i<12*n; i+=12, eta_pt+=24 ) { dpbn_su3_T_PRECISION( op->prpT+i, eta_pt ); dpbn_su3_Z_PRECISION( 
op->prpZ+i, eta_pt ); dpbn_su3_Y_PRECISION( op->prpY+i, eta_pt ); dpbn_su3_X_PRECISION( op->prpX+i, eta_pt ); } -#endif } else { #endif // project in negative directions -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - prp_PRECISION( prn, phi, 12*start, 12*n ); -#else complex_PRECISION pbuf[6]; - for ( i=6*start, phi_pt=phi+12*start; i<6*n; i+=6, phi_pt+=12 ) { + for ( i=6*start, phi_pt=phi->vector_buffer+12*start; i<6*n; i+=6, phi_pt+=12 ) { prp_T_PRECISION( op->prnT+i, phi_pt ); prp_Z_PRECISION( op->prnZ+i, phi_pt ); prp_Y_PRECISION( op->prnY+i, phi_pt ); prp_X_PRECISION( op->prnX+i, phi_pt ); } -#endif // start communication in negative direction START_LOCKED_MASTER(threading) ghost_sendrecv_PRECISION( op->prnT, T, -1, &(op->c), minus_dir_param, l ); @@ -1272,10 +1107,7 @@ void hopping_term_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operato ghost_sendrecv_PRECISION( op->prnX, X, -1, &(op->c), minus_dir_param, l ); END_LOCKED_MASTER(threading) // project plus dir and multiply with U dagger -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - prn_su3_PRECISION( prp, phi, op, neighbor, 12*start, 12*n ); -#else - for ( phi_pt=phi+12*start, end_pt=phi+12*n, D_pt = op->D+36*start, nb_pt=neighbor+4*start; phi_ptvector_buffer+12*start, end_pt=phi->vector_buffer+12*n, D_pt = op->D+36*start, nb_pt=neighbor+4*start; phi_ptprpX+i, D_pt, pbuf ); mvmh_PRECISION( op->prpX+i+3, D_pt, pbuf+3 ); D_pt += 9; } -#endif if ( amount == _EVEN_SITES ) { start = start_even, n = end_even; } else if ( amount == _ODD_SITES ) { @@ -1316,10 +1147,7 @@ void hopping_term_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operato ghost_wait_PRECISION( op->prnX, X, -1, &(op->c), minus_dir_param, l ); END_LOCKED_MASTER(threading) // multiply with U and lift up minus dir -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - su3_pbp_PRECISION( eta, prn, op, neighbor, 12*start, 12*n ); -#else - for ( eta_pt=eta+12*start, end_pt=eta+12*n, D_pt = op->D+36*start, nb_pt=neighbor+4*start; eta_ptvector_buffer+12*start, end_pt=eta->vector_buffer+12*n, D_pt = op->D+36*start, nb_pt=neighbor+4*start; eta_ptprnT+i ); @@ -1341,7 +1169,6 @@ void hopping_term_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operato mvm_PRECISION( pbuf+3, D_pt, op->prnX+i+3 ); pbp_su3_X_PRECISION( pbuf, eta_pt ); D_pt += 9; } -#endif // wait for communication in positive direction START_LOCKED_MASTER(threading) ghost_wait_PRECISION( op->prpT, T, +1, &(op->c), plus_dir_param, l ); @@ -1350,16 +1177,12 @@ void hopping_term_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operato ghost_wait_PRECISION( op->prpX, X, +1, &(op->c), plus_dir_param, l ); END_LOCKED_MASTER(threading) // lift up plus dir -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - pbn_PRECISION( eta, prp, 12*start, 12*n ); -#else - for ( i=6*start, eta_pt=eta+12*start; i<6*n; i+=6, eta_pt+=12 ) { + for ( i=6*start, eta_pt=eta->vector_buffer+12*start; i<6*n; i+=6, eta_pt+=12 ) { pbn_su3_T_PRECISION( op->prpT+i, eta_pt ); pbn_su3_Z_PRECISION( op->prpZ+i, eta_pt ); pbn_su3_Y_PRECISION( op->prpY+i, eta_pt ); pbn_su3_X_PRECISION( op->prpX+i, eta_pt ); } -#endif #ifdef HAVE_TM1p1 } #endif @@ -1367,7 +1190,7 @@ void hopping_term_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operato SYNC_CORES(threading) } -void apply_schur_complement_PRECISION( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, +void apply_schur_complement_PRECISION( vector_PRECISION *out, vector_PRECISION *in, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { 
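What this routine computes on the even sites is the even-odd Schur complement S = D_ee - D_eo * D_oo^{-1} * D_oe, assembled below from diag_ee_PRECISION, hopping_term_PRECISION and diag_oo_inv_PRECISION. A scalar toy example of that algebra, self-contained and independent of the library types:

#include <complex.h>
#include <stdio.h>

/* Scalar illustration of the even-odd Schur complement
 *   S = D_ee - D_eo * D_oo^{-1} * D_oe
 * that apply_schur_complement_PRECISION realises block-wise. */
int main(void) {
  double complex Dee = 4.0, Deo = 1.0 + I, Doe = 1.0 - I, Doo = 3.0;
  double complex S = Dee - Deo * Doe / Doo;
  printf("S = %g %+gi\n", creal(S), cimag(S));
  return 0;
}

Solving the even-site system with S and back-substituting for the odd sites, as solve_oddeven_PRECISION does below, is equivalent to solving the full system while iterating on only half the unknowns.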
/********************************************************************************* @@ -1383,8 +1206,8 @@ void apply_schur_complement_PRECISION( vector_PRECISION out, vector_PRECISION in vector_PRECISION *tmp = op->buffer; SYNC_CORES(threading) - vector_PRECISION_define( tmp[0], 0, start_odd, end_odd, l ); - vector_PRECISION_define( tmp[0], 0, start_even, end_even, l ); + vector_PRECISION_define( &tmp[0], 0, start_odd, end_odd, l ); + vector_PRECISION_define( &tmp[0], 0, start_even, end_even, l ); SYNC_CORES(threading) PROF_PRECISION_START( _NC, threading ); @@ -1392,17 +1215,17 @@ void apply_schur_complement_PRECISION( vector_PRECISION out, vector_PRECISION in diag_ee_PRECISION( out, in, op, l, start_even, end_even ); SYNC_CORES(threading) PROF_PRECISION_STOP( _SC, 1, threading ); - hopping_term_PRECISION( tmp[0], in, op, _ODD_SITES, l, threading ); + hopping_term_PRECISION( &tmp[0], in, op, _ODD_SITES, l, threading ); PROF_PRECISION_STOP( _NC, 0, threading ); PROF_PRECISION_START( _SC, threading ); - diag_oo_inv_PRECISION( tmp[1], tmp[0], op, l, start_odd, end_odd ); + diag_oo_inv_PRECISION( &tmp[1], &tmp[0], op, l, start_odd, end_odd ); SYNC_CORES(threading) PROF_PRECISION_STOP( _SC, 0, threading ); PROF_PRECISION_START( _NC, threading ); - hopping_term_PRECISION( tmp[0], tmp[1], op, _EVEN_SITES, l, threading ); + hopping_term_PRECISION( &tmp[0], &tmp[1], op, _EVEN_SITES, l, threading ); PROF_PRECISION_STOP( _NC, 1, threading ); - vector_PRECISION_minus( out, out, tmp[0], start_even, end_even, l ); + vector_PRECISION_minus( out, out, &tmp[0], start_even, end_even, l ); } @@ -1417,80 +1240,68 @@ void solve_oddeven_PRECISION( gmres_PRECISION_struct *p, operator_PRECISION_stru // odd to even PROF_PRECISION_START( _SC, threading ); - diag_oo_inv_PRECISION( tmp, p->b, op, l, start, end ); + diag_oo_inv_PRECISION( &tmp, &(p->b), op, l, start, end ); PROF_PRECISION_STOP( _SC, 0, threading ); SYNC_CORES(threading) - vector_PRECISION_scale( tmp, tmp, -1, start, end, l ); + vector_PRECISION_scale( &tmp, &tmp, -1, start, end, l ); SYNC_CORES(threading) PROF_PRECISION_START( _NC, threading ); - hopping_term_PRECISION( p->b, tmp, op, _EVEN_SITES, l, threading ); + hopping_term_PRECISION( &(p->b), &tmp, op, _EVEN_SITES, l, threading ); PROF_PRECISION_STOP( _NC, 0, threading ); if ( g.method == 4 ) fgmres_PRECISION( p, l, threading ); else if ( g.method == 5 ) bicgstab_PRECISION( p, l, threading ); - diag_oo_inv_PRECISION( p->x, p->b, op, l, start, end ); + diag_oo_inv_PRECISION( &(p->x), &(p->b), op, l, start, end ); // even to odd SYNC_CORES(threading) - vector_PRECISION_define( tmp, 0, start, end, l ); + vector_PRECISION_define( &tmp, 0, start, end, l ); SYNC_CORES(threading) PROF_PRECISION_START( _NC, threading ); - hopping_term_PRECISION( tmp, p->x, op, _ODD_SITES, l, threading ); + hopping_term_PRECISION( &tmp, &(p->x), op, _ODD_SITES, l, threading ); PROF_PRECISION_STOP( _NC, 1, threading ); PROF_PRECISION_START( _SC, threading ); - diag_oo_inv_PRECISION( p->b, tmp, op, l, start, end ); + diag_oo_inv_PRECISION( &(p->b), &tmp, op, l, start, end ); PROF_PRECISION_STOP( _SC, 1, threading ); SYNC_CORES(threading) - vector_PRECISION_minus( p->x, p->x, p->b, start, end, l ); + vector_PRECISION_minus( &(p->x), &(p->x), &(p->b), start, end, l ); SYNC_CORES(threading) } -void g5_PRECISION( vector_PRECISION eta, vector_PRECISION phi, int start, int end, level_struct *l ) { - if ( eta != phi ) { - vector_PRECISION eta_end = eta + end; - eta += start; - phi += start; - while ( eta < eta_end ) { - FOR6( *eta 
= -(*phi); phi++; eta++; ) - FOR6( *eta = (*phi); phi++; eta++; ) +void g5_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, int start, int end, level_struct *l ) { + if ( eta->vector_buffer != phi->vector_buffer ) { + for( int i = start; i < end; ) { + FOR6( eta->vector_buffer[i] = -phi->vector_buffer[i]; i++; ) + FOR6( eta->vector_buffer[i] = phi->vector_buffer[i]; i++; ) } } else { - vector_PRECISION eta_end = eta + end; - eta += start; - phi += start; - while ( eta < eta_end ) { - FOR6( *eta = -(*phi); phi++; eta++; ) - eta+=6; phi+=6; + for ( int i = start; i < end; ) { + FOR6( eta->vector_buffer[i] = phi->vector_buffer[i]; i++; ) + i+=6; } } } -void minus_g5_PRECISION( vector_PRECISION eta, vector_PRECISION phi, int start, int end, level_struct *l ) { - if ( eta != phi ) { - vector_PRECISION eta_end = eta + end; - eta += start; - phi += start; - while ( eta < eta_end ) { - FOR6( *eta = (*phi); phi++; eta++; ) - FOR6( *eta = -(*phi); phi++; eta++; ) +void minus_g5_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, int start, int end, level_struct *l ) { + if ( eta->vector_buffer != phi->vector_buffer ) { + for ( int i = start; i < end; ) { + FOR6( eta->vector_buffer[i] = phi->vector_buffer[i]; i++; ) + FOR6( eta->vector_buffer[i] = -phi->vector_buffer[i]; i++; ) } } else { - vector_PRECISION eta_end = eta + end; - eta += start; - phi += start; - while ( eta < eta_end ) { - eta+=6; phi+=6; - FOR6( *eta = -(*phi); phi++; eta++; ) + for ( int i = start; i < end; ) { + i+=6; + FOR6( eta->vector_buffer[i] = -phi->vector_buffer[i]; i++; ) } } } -void g5D_apply_schur_complement_PRECISION( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { +void g5D_apply_schur_complement_PRECISION( vector_PRECISION *out, vector_PRECISION *in, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { /********************************************************************************* * Applies the Schur complement to a vector. 
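The rewritten g5_PRECISION and minus_g5_PRECISION above act per lattice site on twelve spin-colour components: in this basis gamma5 flips the sign of the first six components and leaves the last six unchanged, and minus_g5 does the opposite. A standalone per-site sketch of that action with plain complex doubles (illustration only; the library versions index through vector_buffer over a start..end range):

#include <complex.h>

/* Per-site gamma5 action: negate the first six of the twelve spin-colour
 * components, copy the remaining six unchanged. */
static void g5_site(double complex eta[12], const double complex phi[12]) {
  int c;
  for (c = 0; c < 6; c++)  eta[c] = -phi[c];
  for (c = 6; c < 12; c++) eta[c] =  phi[c];
}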
@@ -1502,10 +1313,14 @@ void g5D_apply_schur_complement_PRECISION( vector_PRECISION out, vector_PRECISIO compute_core_start_end_custom(op->num_even_sites*l->num_lattice_site_var, l->inner_vector_size, &start_odd, &end_odd, l, threading, l->num_lattice_site_var ); vector_PRECISION *tmp = op->buffer; + + // vector_PRECISION **tmp; + // *tmp->vector_buffer = op->buffer->vector_buffer; + SYNC_CORES(threading) - vector_PRECISION_define( tmp[0], 0, start_odd, end_odd, l ); - vector_PRECISION_define( tmp[0], 0, start_even, end_even, l ); + vector_PRECISION_define( &tmp[0], 0, start_odd, end_odd, l ); + vector_PRECISION_define( &tmp[0], 0, start_even, end_even, l ); SYNC_CORES(threading) PROF_PRECISION_START( _NC, threading ); @@ -1513,17 +1328,17 @@ void g5D_apply_schur_complement_PRECISION( vector_PRECISION out, vector_PRECISIO diag_ee_PRECISION( out, in, op, l, start_even, end_even ); SYNC_CORES(threading) PROF_PRECISION_STOP( _SC, 1, threading ); - hopping_term_PRECISION( tmp[0], in, op, _ODD_SITES, l, threading ); + hopping_term_PRECISION( &tmp[0], in, op, _ODD_SITES, l, threading ); PROF_PRECISION_STOP( _NC, 0, threading ); PROF_PRECISION_START( _SC, threading ); - diag_oo_inv_PRECISION( tmp[1], tmp[0], op, l, start_odd, end_odd ); + diag_oo_inv_PRECISION( &tmp[1], &tmp[0], op, l, start_odd, end_odd ); SYNC_CORES(threading) PROF_PRECISION_STOP( _SC, 0, threading ); PROF_PRECISION_START( _NC, threading ); - hopping_term_PRECISION( tmp[0], tmp[1], op, _EVEN_SITES, l, threading ); + hopping_term_PRECISION( &tmp[0], &tmp[1], op, _EVEN_SITES, l, threading ); PROF_PRECISION_STOP( _NC, 1, threading ); - vector_PRECISION_minus( out, out, tmp[0], start_even, end_even, l ); + vector_PRECISION_minus( out, out, &tmp[0], start_even, end_even, l ); SYNC_CORES(threading) g5_PRECISION( out, out, start_even, end_even, l ); // g5_PRECISION( out, out, start_odd, end_odd, l ); @@ -1541,40 +1356,40 @@ void g5D_solve_oddeven_PRECISION( gmres_PRECISION_struct *p, operator_PRECISION_ // odd to even PROF_PRECISION_START( _SC, threading ); - diag_oo_inv_PRECISION( tmp, p->b, op, l, start_odd, end_odd ); + diag_oo_inv_PRECISION( &tmp, &(p->b), op, l, start_odd, end_odd ); PROF_PRECISION_STOP( _SC, 0, threading ); SYNC_CORES(threading) // g5_PRECISION( tmp, tmp, start_odd, end_odd, l ); // vector_PRECISION_scale( tmp, tmp, -1, start_odd, end_odd, l ); - minus_g5_PRECISION( tmp, tmp, start_odd, end_odd, l ); + minus_g5_PRECISION( &tmp, &tmp, start_odd, end_odd, l ); SYNC_CORES(threading) PROF_PRECISION_START( _NC, threading ); - vector_PRECISION_define( p->x, 0, start_even, end_even, l ); - hopping_term_PRECISION( p->x, tmp, op, _EVEN_SITES, l, threading ); + vector_PRECISION_define( &(p->x), 0, start_even, end_even, l ); + hopping_term_PRECISION( &(p->x), &tmp, op, _EVEN_SITES, l, threading ); PROF_PRECISION_STOP( _NC, 0, threading ); SYNC_CORES(threading) - g5_PRECISION( p->x, p->x, start_even, end_even, l ); - vector_PRECISION_plus( p->b, p->b, p->x, start_even, end_even, l ); + g5_PRECISION( &(p->x), &(p->x), start_even, end_even, l ); + vector_PRECISION_plus( &(p->b), &(p->b), &(p->x), start_even, end_even, l ); SYNC_CORES(threading) ASSERT( g.method == 6 ); fgmres_PRECISION( p, l, threading ); - diag_oo_inv_PRECISION( p->x, p->b, op, l, start_odd, end_odd ); - g5_PRECISION( p->x, p->x, start_odd, end_odd, l ); + diag_oo_inv_PRECISION( &(p->x), &(p->b), op, l, start_odd, end_odd ); + g5_PRECISION( &(p->x), &(p->x), start_odd, end_odd, l ); // even to odd SYNC_CORES(threading) - vector_PRECISION_define( tmp, 0, 
start_odd, end_odd, l ); + vector_PRECISION_define( &tmp, 0, start_odd, end_odd, l ); SYNC_CORES(threading) PROF_PRECISION_START( _NC, threading ); - hopping_term_PRECISION( tmp, p->x, op, _ODD_SITES, l, threading ); + hopping_term_PRECISION( &tmp, &(p->x), op, _ODD_SITES, l, threading ); PROF_PRECISION_STOP( _NC, 1, threading ); SYNC_CORES(threading) PROF_PRECISION_START( _SC, threading ); - diag_oo_inv_PRECISION( p->b, tmp, op, l, start_odd, end_odd ); + diag_oo_inv_PRECISION( &(p->b), &tmp, op, l, start_odd, end_odd ); PROF_PRECISION_STOP( _SC, 1, threading ); SYNC_CORES(threading) - vector_PRECISION_minus( p->x, p->x, p->b, start_odd, end_odd, l ); + vector_PRECISION_minus( &(p->x), &(p->x), &(p->b), start_odd, end_odd, l ); SYNC_CORES(threading) } @@ -1596,14 +1411,9 @@ void schwarz_PRECISION_oddeven_setup( schwarz_PRECISION_struct *s, level_struct } if ( g.csw ) { -#ifndef OPTIMIZED_SELF_COUPLING_PRECISION config_PRECISION clover_pt = op->clover, clover_oo_inv_pt = op->clover_oo_inv; complex_double buffer[42]; int cs = 42; -#else - PRECISION *clover_pt = op->clover_vectorized, *clover_oo_inv_pt = op->clover_oo_inv_vectorized; - int cs = 144; -#endif for ( d0=0; d0clover_doublet_oo_inv, clover_pt = op->clover; int cs = g.csw ? 42:12; -#else - PRECISION *clover_pt = g.csw ? op->clover_doublet_vectorized:(PRECISION*)op->clover, *clover_oo_inv_pt = op->clover_doublet_oo_inv_vectorized; - int cs = g.csw ? 288:24; -#endif config_PRECISION eps_term_pt = op->epsbar_term; #ifdef HAVE_TM tm_term_pt = op->tm_term; @@ -1688,8 +1486,6 @@ void schwarz_PRECISION_oddeven_setup( schwarz_PRECISION_struct *s, level_struct for ( x=a1*block_size[X]; x<(a1+1)*block_size[X]; x++ ) { if (((t-d1*block_size[T])+(z-c1*block_size[Z])+ (y-b1*block_size[Y])+(x-a1*block_size[X]))%2 == 1 ) { - -#ifndef OPTIMIZED_SELF_COUPLING_PRECISION if ( g.csw ) { for( i=0; i<12; i++ ) //0-23 buffer[i+12] = buffer[i] = (complex_double) clover_pt[i]; @@ -1715,48 +1511,13 @@ void schwarz_PRECISION_oddeven_setup( schwarz_PRECISION_struct *s, level_struct clover_pt += cs; selfcoupling_LU_doublet_decomposition_PRECISION( clover_oo_inv_pt, buffer ); clover_oo_inv_pt += 288; -#else - if ( g.csw ) { - sse_site_clover_doublet_invert_PRECISION( clover_pt, eps_term_pt, clover_oo_inv_pt ); - } else { -#ifdef HAVE_TM - for ( i=0; i<6; i++ ) { //we temporaly save in clover_oo_inv_pt - clover_oo_inv_pt[2*i] = clover_pt[2*i] + creal_PRECISION(tm_term_pt[i]); - clover_oo_inv_pt[2*i+1] = clover_pt[2*i+1] + cimag_PRECISION(tm_term_pt[i]); - clover_oo_inv_pt[2*i+12] = clover_pt[2*i] - creal_PRECISION(tm_term_pt[i]); - clover_oo_inv_pt[2*i+13] = clover_pt[2*i+1] - cimag_PRECISION(tm_term_pt[i]); - } - for ( i=6; i<12; i++ ) { - clover_oo_inv_pt[2*i+12] = clover_pt[2*i] + creal_PRECISION(tm_term_pt[i]); - clover_oo_inv_pt[2*i+13] = clover_pt[2*i+1] + cimag_PRECISION(tm_term_pt[i]); - clover_oo_inv_pt[2*i+24] = clover_pt[2*i] - creal_PRECISION(tm_term_pt[i]); - clover_oo_inv_pt[2*i+25] = clover_pt[2*i+1] - cimag_PRECISION(tm_term_pt[i]); - } - tm_term_pt += 12; -#else - for ( i=0; i<6; i++ ) { - clover_oo_inv_pt[2*i+12] = clover_oo_inv_pt[2*i] = clover_pt[2*i]; - clover_oo_inv_pt[2*i+13] = clover_oo_inv_pt[2*i+1] = clover_pt[2*i+1]; - } - for ( i=6; i<12; i++ ) { - clover_oo_inv_pt[2*i+24] = clover_oo_inv_pt[2*i+12] = clover_pt[2*i]; - clover_oo_inv_pt[2*i+25] = clover_oo_inv_pt[2*i+13] = clover_pt[2*i+1]; - } -#endif - sse_site_clover_doublet_invert_PRECISION( clover_oo_inv_pt, eps_term_pt, clover_oo_inv_pt ); - } - - clover_pt += cs; - eps_term_pt 
+= 12; - clover_oo_inv_pt += 2*288; -#endif } } } #endif } -void block_diag_ee_PRECISION( vector_PRECISION eta, vector_PRECISION phi, +void block_diag_ee_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, int start, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ) { START_UNTHREADED_FUNCTION(threading) @@ -1767,69 +1528,51 @@ void block_diag_ee_PRECISION( vector_PRECISION eta, vector_PRECISION phi, } // diagonal blocks applied to the odd sites of a block -void block_diag_oo_PRECISION( vector_PRECISION eta, vector_PRECISION phi, +void block_diag_oo_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, int start, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ) { START_UNTHREADED_FUNCTION(threading) - -#ifdef OPTIMIZED_SELF_COUPLING_PRECISION - //we don't have the LU decomposition here, for debugging only - int n1 = s->num_block_even_sites, n2 = s->num_block_odd_sites, nv = l->num_lattice_site_var; - clover_PRECISION( eta, phi, &(s->op), start+nv*n1, start+nv*(n1+n2), l, threading ); - -#else - int i, n1 = s->num_block_even_sites, n2 = s->num_block_odd_sites; #ifdef HAVE_TM1p1 if ( g.n_flavours == 2 ) { int block_num = start/24/(n1+n2); // config_PRECISION clover = s->op.clover_doublet_oo_inv+n1*288+(start/24)*288; - config_PRECISION clover = s->op.clover_doublet_oo_inv+(start/24-block_num*n1)*288; - vector_PRECISION lphi = phi+n1*24+start, leta = eta+n1*24+start; - for ( i=0; iop.clover_doublet_oo_inv-(block_num+1)*n1*288; + LU_multiply_PRECISION( eta, phi, clover, n1*24+start, (n1+n2)*24+start ); } else { #endif - vector_PRECISION lphi = phi+n1*12+start, leta = eta+n1*12+start; if ( g.csw ) { int block_num = start/12/(n1+n2); #ifndef HAVE_TM - config_PRECISION clover = s->op.clover_oo_inv+(start/12-block_num*n1)*42; - for ( i=0; iop.clover_oo_inv-(block_num+1)*n1*42; + LLH_multiply_PRECISION( eta, phi, clover, n1*12+start, (n1+n2)*12+start ); #else - config_PRECISION clover = s->op.clover_oo_inv+(start/12-block_num*n1)*72; - for ( i=0; iop.clover_oo_inv-(block_num+1)*n1*72; + LU_multiply_PRECISION( eta, phi, clover, n1*12+start, (n1+n2)*12+start ); #endif } else { + vector_PRECISION lphi, leta; + lphi.vector_buffer = phi->vector_buffer+n1*12+start; + leta.vector_buffer = eta->vector_buffer+n1*12+start; config_PRECISION clover = s->op.clover+n1*12+start; #ifndef HAVE_TM for ( i=0; i<12*n2; i++ ) - leta[i] = lphi[i]*(clover[i]); + leta.vector_buffer[i] = lphi.vector_buffer[i]*(clover[i]); #else config_PRECISION tm_term = s->op.tm_term+n1*12+start; for ( i=0; i<12*n2; i++ ) - leta[i] = lphi[i]*(clover[i]+tm_term[i]); + leta.vector_buffer[i] = lphi.vector_buffer[i]*(clover[i]+tm_term[i]); #endif } #ifdef HAVE_TM1p1 } #endif - -#endif END_UNTHREADED_FUNCTION(threading) } // inverted diagonal blocks applied to the odd sites of a block -void block_diag_oo_inv_PRECISION( vector_PRECISION eta, vector_PRECISION phi, int start, schwarz_PRECISION_struct *s, +void block_diag_oo_inv_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, int start, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ) { START_UNTHREADED_FUNCTION(threading) @@ -1839,58 +1582,33 @@ void block_diag_oo_inv_PRECISION( vector_PRECISION eta, vector_PRECISION phi, in #ifdef HAVE_TM1p1 if ( g.n_flavours == 2 ) { - vector_PRECISION lphi = phi+n1*24+start, leta = eta+n1*24+start; int block_num = start/24/(n1+n2); -#ifndef OPTIMIZED_SELF_COUPLING_PRECISION - config_PRECISION clover = s->op.clover_doublet_oo_inv + (start/24-block_num*n1)*288; - for ( i=0; 
iop.clover_doublet_oo_inv_vectorized + (start/24-block_num*n1)*2*288; - for ( i=0; iop.clover_doublet_oo_inv-(block_num+1)*n1*288; + LU_perform_fwd_bwd_subs_PRECISION( eta, phi, clover, n1*24+start, (n1+n2)*24+start ); } else { #endif - vector_PRECISION lphi = phi+n1*12+start, leta = eta+n1*12+start; if ( g.csw ) { int block_num = start/12/(n1+n2); -#ifndef OPTIMIZED_SELF_COUPLING_PRECISION #ifndef HAVE_TM - config_PRECISION clover = s->op.clover_oo_inv+(start/12-block_num*n1)*42; - for ( i=0; iop.clover_oo_inv+(start/12-block_num*n1)*72; - for ( i=0; iop.clover_oo_inv-(block_num+1)*n1*42; + LLH_perform_fwd_bwd_subs_PRECISION( eta, phi, clover, n1*12+start, (n1+n2)*12+start ); #else - PRECISION *clover_vectorized = s->op.clover_oo_inv_vectorized + (start/12-block_num*n1)*144; - for ( i=0; iop.clover_oo_inv-(block_num+1)*n1*72; + LU_perform_fwd_bwd_subs_PRECISION( eta, phi, clover, n1*12+start, (n1+n2)*12+start ); #endif } else { config_PRECISION clover = s->op.clover+n1*12+start; + vector_PRECISION lphi, leta; + lphi.vector_buffer = phi->vector_buffer+n1*12+start; + leta.vector_buffer = eta->vector_buffer+n1*12+start; #ifndef HAVE_TM for ( i=0; i<12*n2; i++ ) - leta[i] = lphi[i]/(clover[i]); + leta.vector_buffer[i] = lphi.vector_buffer[i]/(clover[i]); #else config_PRECISION tm_term = s->op.tm_term+n1*12+start; for ( i=0; i<12*n2; i++ ) - leta[i] = lphi[i]/(clover[i]+tm_term[i]); + leta.vector_buffer[i] = lphi.vector_buffer[i]/(clover[i]+tm_term[i]); #endif } #ifdef HAVE_TM1p1 @@ -1901,7 +1619,7 @@ void block_diag_oo_inv_PRECISION( vector_PRECISION eta, vector_PRECISION phi, in } -void block_hopping_term_PRECISION( vector_PRECISION eta, vector_PRECISION phi, +void block_hopping_term_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, int start, int amount, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ) { START_UNTHREADED_FUNCTION(threading) @@ -1909,30 +1627,10 @@ void block_hopping_term_PRECISION( vector_PRECISION eta, vector_PRECISION phi, int a1, a2, n1, n2, *length_even = s->dir_length_even, *length_odd = s->dir_length_odd, **index = s->oe_index, *neighbor = s->op.neighbor_table, nv = l->num_lattice_site_var; -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_float - PRECISION *Dplus = s->op.D_vectorized + (start/nv)*96; - PRECISION *Dminus = s->op.D_transformed_vectorized + (start/nv)*96; - - for ( int mu=0; mu<4; mu++ ) { - if ( amount == _EVEN_SITES ) { - a1 = 0; n1 = length_even[mu]; - a2 = n1; n2 = a2 + length_odd[mu]; - } else if ( amount == _ODD_SITES ) { - a1 = length_even[mu]; n1 = a1 + length_odd[mu]; - a2 = 0; n2 = a1; - } else { - a1 = 0; n1 = length_even[mu]+length_odd[mu]; - a2 = 0; n2 = n1; - } - block_oddeven_plus_coupling_PRECISION( (PRECISION*)(eta+start), Dplus, (PRECISION*)(phi+start), mu, a1, n1, index[mu], neighbor ); - block_oddeven_minus_coupling_PRECISION( (PRECISION*)(eta+start), Dminus, (PRECISION*)(phi+start), mu, a2, n2, index[mu], neighbor ); - } - -#else config_PRECISION D = s->op.D + (start/nv)*36; int i, j, k, *ind; config_PRECISION D_pt; - vector_PRECISION lphi = phi+start, leta = eta+start; + buffer_PRECISION lphi = phi->vector_buffer+start, leta = eta->vector_buffer+start; #ifdef HAVE_TM1p1 if ( g.n_flavours == 2 ) { @@ -2188,13 +1886,12 @@ void block_hopping_term_PRECISION( vector_PRECISION eta, vector_PRECISION phi, } #ifdef HAVE_TM1p1 } -#endif #endif END_UNTHREADED_FUNCTION(threading) } -void block_n_hopping_term_PRECISION( vector_PRECISION eta, vector_PRECISION phi, +void block_n_hopping_term_PRECISION( vector_PRECISION *eta, 
vector_PRECISION *phi, int start, int amount, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ) { START_UNTHREADED_FUNCTION(threading) @@ -2202,28 +1899,8 @@ void block_n_hopping_term_PRECISION( vector_PRECISION eta, vector_PRECISION phi, int a1, a2, n1, n2, *length_even = s->dir_length_even, *length_odd = s->dir_length_odd, **index = s->oe_index, *neighbor = s->op.neighbor_table, nv = l->num_lattice_site_var; -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_float - PRECISION *Dplus = s->op.D_vectorized + (start/nv)*96; - PRECISION *Dminus = s->op.D_transformed_vectorized + (start/nv)*96; - - for ( int mu=0; mu<4; mu++ ) { - if ( amount == _EVEN_SITES ) { - a1 = 0; n1 = length_even[mu]; - a2 = n1; n2 = a2 + length_odd[mu]; - } else if ( amount == _ODD_SITES ) { - a1 = length_even[mu]; n1 = a1 + length_odd[mu]; - a2 = 0; n2 = a1; - } else { - a1 = 0; n1 = length_even[mu]+length_odd[mu]; - a2 = 0; n2 = n1; - } - block_oddeven_nplus_coupling_PRECISION( (PRECISION*)(eta+start), Dplus, (PRECISION*)(phi+start), mu, a1, n1, index[mu], neighbor ); - block_oddeven_nminus_coupling_PRECISION( (PRECISION*)(eta+start), Dminus, (PRECISION*)(phi+start), mu, a2, n2, index[mu], neighbor ); - } - -#else int i, j, k, *ind; - vector_PRECISION lphi = phi+start, leta = eta+start; + buffer_PRECISION lphi = phi->vector_buffer+start, leta = eta->vector_buffer+start; config_PRECISION D_pt, D = s->op.D + (start/nv)*36; #ifdef HAVE_TM1p1 @@ -2482,27 +2159,26 @@ void block_n_hopping_term_PRECISION( vector_PRECISION eta, vector_PRECISION phi, #ifdef HAVE_TM1p1 } #endif -#endif END_UNTHREADED_FUNCTION(threading) } -void apply_block_schur_complement_PRECISION( vector_PRECISION out, vector_PRECISION in, int start, +void apply_block_schur_complement_PRECISION( vector_PRECISION *out, vector_PRECISION *in, int start, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ) { vector_PRECISION *tmp = s->oe_buf; block_diag_ee_PRECISION( out, in, start, s, l, threading ); START_LOCKED_MASTER(threading) - vector_PRECISION_define( tmp[0], 0, start + l->num_lattice_site_var*s->num_block_even_sites, start + s->block_vector_size, l ); + vector_PRECISION_define( &tmp[0], 0, start + l->num_lattice_site_var*s->num_block_even_sites, start + s->block_vector_size, l ); END_LOCKED_MASTER(threading) - block_hopping_term_PRECISION( tmp[0], in, start, _ODD_SITES, s, l, threading ); - block_diag_oo_inv_PRECISION( tmp[1], tmp[0], start, s, l, threading ); - block_n_hopping_term_PRECISION( out, tmp[1], start, _EVEN_SITES, s, l, threading ); + block_hopping_term_PRECISION( &tmp[0], in, start, _ODD_SITES, s, l, threading ); + block_diag_oo_inv_PRECISION( &tmp[1], &tmp[0], start, s, l, threading ); + block_n_hopping_term_PRECISION( out, &tmp[1], start, _EVEN_SITES, s, l, threading ); } -void block_solve_oddeven_PRECISION( vector_PRECISION phi, vector_PRECISION r, vector_PRECISION latest_iter, +void block_solve_oddeven_PRECISION( vector_PRECISION *phi, vector_PRECISION *r, vector_PRECISION *latest_iter, int start, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ) { START_UNTHREADED_FUNCTION(threading) @@ -2511,21 +2187,19 @@ void block_solve_oddeven_PRECISION( vector_PRECISION phi, vector_PRECISION r, ve int end = start+s->block_vector_size; // odd to even - vector_PRECISION_copy( tmp[3], r, start, end, l ); - block_diag_oo_inv_PRECISION( tmp[2], tmp[3], start, s, l, no_threading ); - block_n_hopping_term_PRECISION( tmp[3], tmp[2], start, _EVEN_SITES, s, l, no_threading ); - - local_minres_PRECISION( 
NULL, tmp[3], tmp[2], start, s, l, no_threading ); - + vector_PRECISION_copy( &tmp[3], r, start, end, l ); + block_diag_oo_inv_PRECISION( &tmp[2], &tmp[3], start, s, l, no_threading ); + block_n_hopping_term_PRECISION( &tmp[3], &tmp[2], start, _EVEN_SITES, s, l, no_threading ); + local_minres_PRECISION( NULL, &tmp[3], &tmp[2], start, s, l, no_threading ); // even to odd - block_n_hopping_term_PRECISION( tmp[3], tmp[2], start, _ODD_SITES, s, l, no_threading ); - block_diag_oo_inv_PRECISION( tmp[2], tmp[3], start, s, l, no_threading ); + block_n_hopping_term_PRECISION( &tmp[3], &tmp[2], start, _ODD_SITES, s, l, no_threading ); + block_diag_oo_inv_PRECISION( &tmp[2], &tmp[3], start, s, l, no_threading ); // update phi, latest_iter - vector_PRECISION_copy( latest_iter, tmp[2], start, end, l ); - vector_PRECISION_plus( phi, phi, tmp[2], start, end, l ); + vector_PRECISION_copy( latest_iter, &tmp[2], start, end, l ); + vector_PRECISION_plus( phi, phi, &tmp[2], start, end, l ); // update r - vector_PRECISION_copy( r, tmp[3], start, start+l->num_lattice_site_var*s->num_block_even_sites, l ); + vector_PRECISION_copy( r, &tmp[3], start, start+l->num_lattice_site_var*s->num_block_even_sites, l ); vector_PRECISION_define( r, 0, start+l->num_lattice_site_var*s->num_block_even_sites, end, l ); END_UNTHREADED_FUNCTION(threading) @@ -2537,55 +2211,61 @@ void block_oddeven_PRECISION_test( level_struct *l, struct Thread *threading ) { schwarz_PRECISION_struct *s = &(l->s_PRECISION); - vector_PRECISION b1 = NULL, b2 = NULL, b3 = NULL, b4 = NULL, b5 = NULL; + vector_PRECISION b1, b2, b3, b4, b5; PRECISION diff; + + vector_PRECISION_init(&b1); + vector_PRECISION_init(&b2); + vector_PRECISION_init(&b3); + vector_PRECISION_init(&b4); + vector_PRECISION_init(&b5); int vs = s->block_vector_size * s->num_blocks; - MALLOC( b1, complex_PRECISION, vs ); - MALLOC( b2, complex_PRECISION, vs ); - MALLOC( b3, complex_PRECISION, vs ); - MALLOC( b4, complex_PRECISION, vs ); - MALLOC( b5, complex_PRECISION, vs ); + MALLOC( b1.vector_buffer, complex_PRECISION, vs ); + MALLOC( b2.vector_buffer, complex_PRECISION, vs ); + MALLOC( b3.vector_buffer, complex_PRECISION, vs ); + MALLOC( b4.vector_buffer, complex_PRECISION, vs ); + MALLOC( b5.vector_buffer, complex_PRECISION, vs ); - vector_PRECISION_define_random( b1, 0, vs, l ); + vector_PRECISION_define_random( &b1, 0, vs, l ); for (int i = 0; i< s->num_blocks; i++ ) { - block_diag_ee_PRECISION( b2, b1, s->block[i].start*l->num_lattice_site_var, s, l, no_threading ); - block_diag_oo_PRECISION( b2, b1, s->block[i].start*l->num_lattice_site_var, s, l, no_threading ); - block_hopping_term_PRECISION( b2, b1, s->block[i].start*l->num_lattice_site_var, _FULL_SYSTEM, s, l, no_threading ); + block_diag_ee_PRECISION( &b2, &b1, s->block[i].start*l->num_lattice_site_var, s, l, no_threading ); + block_diag_oo_PRECISION( &b2, &b1, s->block[i].start*l->num_lattice_site_var, s, l, no_threading ); + block_hopping_term_PRECISION( &b2, &b1, s->block[i].start*l->num_lattice_site_var, _FULL_SYSTEM, s, l, no_threading ); - block_d_plus_clover_PRECISION( b3, b1, s->block[i].start*l->num_lattice_site_var, s, l, no_threading ); + block_d_plus_clover_PRECISION( &b3, &b1, s->block[i].start*l->num_lattice_site_var, s, l, no_threading ); } - vector_PRECISION_minus( b3, b3, b2, 0, vs, l ); - diff = global_norm_PRECISION( b3, 0, vs, l, no_threading ) / global_norm_PRECISION( b2, 0, vs, l, no_threading ); + vector_PRECISION_minus( &b3, &b3, &b2, 0, vs, l ); + diff = global_norm_PRECISION( &b3, 0, vs, l, 
no_threading ) / global_norm_PRECISION( &b2, 0, vs, l, no_threading ); test0_PRECISION("depth: %d, correctness of block odd even layout: %le\n", l->depth, diff ); - vector_PRECISION_copy( b4, b1, 0, s->block_vector_size, l ); - vector_PRECISION_define( b3, 0, l->num_lattice_site_var*s->num_block_even_sites, s->block_vector_size, l ); + vector_PRECISION_copy( &b4, &b1, 0, s->block_vector_size, l ); + vector_PRECISION_define( &b3, 0, l->num_lattice_site_var*s->num_block_even_sites, s->block_vector_size, l ); - block_hopping_term_PRECISION( b3, b4, 0, _ODD_SITES, s, l, no_threading ); - block_diag_oo_inv_PRECISION( b5, b3, 0, s, l, no_threading ); - vector_PRECISION_plus( b4, b4, b5, l->num_lattice_site_var*s->num_block_even_sites, s->block_vector_size, l ); + block_hopping_term_PRECISION( &b3, &b4, 0, _ODD_SITES, s, l, no_threading ); + block_diag_oo_inv_PRECISION( &b5, &b3, 0, s, l, no_threading ); + vector_PRECISION_plus( &b4, &b4, &b5, l->num_lattice_site_var*s->num_block_even_sites, s->block_vector_size, l ); - apply_block_schur_complement_PRECISION( b3, b4, 0, s, l, no_threading ); - block_diag_oo_PRECISION( b3, b4, 0, s, l, no_threading ); + apply_block_schur_complement_PRECISION( &b3, &b4, 0, s, l, no_threading ); + block_diag_oo_PRECISION( &b3, &b4, 0, s, l, no_threading ); - block_diag_oo_inv_PRECISION( b5, b3, 0, s, l, no_threading ); - block_hopping_term_PRECISION( b3, b5, 0, _EVEN_SITES, s, l, no_threading ); + block_diag_oo_inv_PRECISION( &b5, &b3, 0, s, l, no_threading ); + block_hopping_term_PRECISION( &b3, &b5, 0, _EVEN_SITES, s, l, no_threading ); - vector_PRECISION_minus( b3, b2, b3, 0, s->block_vector_size, l ); - diff = global_norm_PRECISION( b3, 0, s->block_vector_size, l, no_threading ) / global_norm_PRECISION( b2, 0, s->block_vector_size, l, no_threading ); + vector_PRECISION_minus( &b3, &b2, &b3, 0, s->block_vector_size, l ); + diff = global_norm_PRECISION( &b3, 0, s->block_vector_size, l, no_threading ) / global_norm_PRECISION( &b2, 0, s->block_vector_size, l, no_threading ); test0_PRECISION("depth: %d, correctness of block odd even schur complement: %le\n", l->depth, diff ); - FREE( b1, complex_PRECISION, vs ); - FREE( b2, complex_PRECISION, vs ); - FREE( b3, complex_PRECISION, vs ); - FREE( b4, complex_PRECISION, vs ); - FREE( b5, complex_PRECISION, vs ); + FREE( b1.vector_buffer, complex_PRECISION, vs ); + FREE( b2.vector_buffer, complex_PRECISION, vs ); + FREE( b3.vector_buffer, complex_PRECISION, vs ); + FREE( b4.vector_buffer, complex_PRECISION, vs ); + FREE( b5.vector_buffer, complex_PRECISION, vs ); END_UNTHREADED_FUNCTION(threading) } @@ -2600,74 +2280,72 @@ void oddeven_PRECISION_test( level_struct *l ) { * - Compare solutions ( Difference should be close to 0 ). 
*********************************************************************************/ - vector_double d1=NULL, d2=NULL, d3=NULL; - vector_PRECISION f1=NULL, f2=NULL, f3=NULL, f4=NULL, f5=NULL; + vector_double d[3]; + vector_PRECISION f[5]; double diff; - MALLOC( d1, complex_double, l->inner_vector_size ); - MALLOC( d2, complex_double, l->inner_vector_size ); - MALLOC( d3, complex_double, l->inner_vector_size ); - MALLOC( f1, complex_PRECISION, l->inner_vector_size ); - MALLOC( f2, complex_PRECISION, l->inner_vector_size ); - MALLOC( f3, complex_PRECISION, l->inner_vector_size ); - MALLOC( f4, complex_PRECISION, l->inner_vector_size ); - MALLOC( f5, complex_PRECISION, l->inner_vector_size ); - - vector_double_define_random( d1, 0, l->inner_vector_size, l ); - serial_to_oddeven_PRECISION( f1, d1, l, no_threading ); + for(int i=0; i<3; i++){ + vector_double_init( &d[i] ); + vector_double_alloc( &d[i], _INNER, 1, l, no_threading ); + } + + for(int i=0; i<5; i++){ + vector_PRECISION_init( &f[i] ); + vector_PRECISION_alloc( &f[i], _INNER, 1, l, no_threading ); + } + + vector_double_define_random( &d[0], 0, l->inner_vector_size, l ); + serial_to_oddeven_PRECISION( &f[0], &d[0], l, no_threading ); - diag_ee_PRECISION( f2, f1, &(l->oe_op_PRECISION), l, 0, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var ); - diag_oo_PRECISION( f2, f1, &(l->oe_op_PRECISION), l, no_threading ); + diag_ee_PRECISION( &f[1], &f[0], &(l->oe_op_PRECISION), l, 0, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var ); + diag_oo_PRECISION( &f[1], &f[0], &(l->oe_op_PRECISION), l, no_threading ); - hopping_term_PRECISION( f2, f1, &(l->oe_op_PRECISION), _FULL_SYSTEM, l, no_threading ); + hopping_term_PRECISION( &f[1], &f[0], &(l->oe_op_PRECISION), _FULL_SYSTEM, l, no_threading ); - d_plus_clover_double( d2, d1, &(g.op_double), l, no_threading ); - oddeven_to_serial_PRECISION( d1, f2, l, no_threading ); + d_plus_clover_double( &d[1], &d[0], &(g.op_double), l, no_threading ); + oddeven_to_serial_PRECISION( &d[0], &f[1], l, no_threading ); - vector_double_minus( d3, d1, d2, 0, l->num_inner_lattice_sites, l ); - diff = global_norm_double( d3, 0, l->num_inner_lattice_sites, l, no_threading )/global_norm_double( d1, 0, l->num_inner_lattice_sites, l, no_threading ); + vector_double_minus( &d[2], &d[0], &d[1], 0, l->num_inner_lattice_sites, l ); + diff = global_norm_double( &d[2], 0, l->num_inner_lattice_sites, l, no_threading )/global_norm_double( &d[0], 0, l->num_inner_lattice_sites, l, no_threading ); test0_PRECISION("depth: %d, correctness of odd even layout: %le\n", l->depth, diff ); // -------------- - vector_PRECISION_copy( f4, f1, 0, l->inner_vector_size, l ); - diag_oo_PRECISION( f3, f4, &(l->oe_op_PRECISION), l, no_threading ); - diag_oo_inv_PRECISION( f4, f3, &(l->oe_op_PRECISION), l, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l->inner_vector_size ); - vector_PRECISION_minus( f4, f4, f1, 0, l->inner_vector_size, l ); + vector_PRECISION_copy( &f[3], &f[0], 0, l->inner_vector_size, l ); + diag_oo_PRECISION( &f[2], &f[3], &(l->oe_op_PRECISION), l, no_threading ); + diag_oo_inv_PRECISION( &f[3], &f[2], &(l->oe_op_PRECISION), l, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l->inner_vector_size ); + vector_PRECISION_minus( &f[3], &f[3], &f[0], 0, l->inner_vector_size, l ); - diff = (PRECISION) (global_norm_PRECISION( f4, 0, l->inner_vector_size, l, no_threading )/global_norm_PRECISION( f1, 0, l->inner_vector_size, l, no_threading )); + diff = (PRECISION) (global_norm_PRECISION( &f[3], 0, 
l->inner_vector_size, l, no_threading )/global_norm_PRECISION( &f[0], 0, l->inner_vector_size, l, no_threading )); test0_PRECISION("depth: %d, correctness of odd even diagonal term: %le\n", l->depth, diff ); // transformation part - vector_PRECISION_copy( f4, f1, 0, l->inner_vector_size, l ); + vector_PRECISION_copy( &f[3], &f[0], 0, l->inner_vector_size, l ); // even to odd // set odd part of f3 to 0. - vector_PRECISION_define( f3, 0, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l->inner_vector_size, l ); + vector_PRECISION_define( &f[2], 0, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l->inner_vector_size, l ); - hopping_term_PRECISION( f3, f4, &(l->oe_op_PRECISION), _ODD_SITES, l, no_threading ); - diag_oo_inv_PRECISION( f5, f3, &(l->oe_op_PRECISION), l, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l->inner_vector_size ); - vector_PRECISION_plus( f4, f4, f5, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l->inner_vector_size, l ); + hopping_term_PRECISION( &f[2], &f[3], &(l->oe_op_PRECISION), _ODD_SITES, l, no_threading ); + diag_oo_inv_PRECISION( &f[4], &f[2], &(l->oe_op_PRECISION), l, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l->inner_vector_size ); + vector_PRECISION_plus( &f[3], &f[3], &f[4], l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l->inner_vector_size, l ); // block diagonal part - apply_schur_complement_PRECISION( f3, f4, &(l->oe_op_PRECISION), l, no_threading ); - diag_oo_PRECISION( f3, f4, &(l->oe_op_PRECISION), l, no_threading ); + apply_schur_complement_PRECISION( &f[2], &f[3], &(l->oe_op_PRECISION), l, no_threading ); + diag_oo_PRECISION( &f[2], &f[3], &(l->oe_op_PRECISION), l, no_threading ); // back transformation part - diag_oo_inv_PRECISION( f5, f3, &(l->oe_op_PRECISION), l, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l->inner_vector_size ); - hopping_term_PRECISION( f3, f5, &(l->oe_op_PRECISION), _EVEN_SITES, l, no_threading ); + diag_oo_inv_PRECISION( &f[4], &f[3], &(l->oe_op_PRECISION), l, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l->inner_vector_size ); + hopping_term_PRECISION( &f[2], &f[4], &(l->oe_op_PRECISION), _EVEN_SITES, l, no_threading ); - vector_PRECISION_minus( f1, f2, f3, 0, l->inner_vector_size, l ); - diff = (PRECISION) (global_norm_PRECISION( f1, 0, l->inner_vector_size, l, no_threading )/global_norm_PRECISION( f2, 0, l->inner_vector_size, l, no_threading )); + vector_PRECISION_minus( &f[0], &f[1], &f[2], 0, l->inner_vector_size, l ); + diff = (PRECISION) (global_norm_PRECISION( &f[0], 0, l->inner_vector_size, l, no_threading )/global_norm_PRECISION( &f[1], 0, l->inner_vector_size, l, no_threading )); test0_PRECISION("depth: %d, correctness of odd even schur complement: %le\n", l->depth, diff ); - - FREE( d1, complex_double, l->inner_vector_size ); - FREE( d2, complex_double, l->inner_vector_size ); - FREE( d3, complex_double, l->inner_vector_size ); - FREE( f1, complex_PRECISION, l->inner_vector_size ); - FREE( f2, complex_PRECISION, l->inner_vector_size ); - FREE( f3, complex_PRECISION, l->inner_vector_size ); - FREE( f4, complex_PRECISION, l->inner_vector_size ); - FREE( f5, complex_PRECISION, l->inner_vector_size ); + + for(int i=0; i<3; i++) + vector_double_free( &d[i], l, no_threading ); + + for(int i=0; i<5; i++) + vector_PRECISION_free( &f[i], l, no_threading ); } diff --git a/src/oddeven_generic.h b/src/oddeven_generic.h index 4fac101..2d03e98 100644 --- a/src/oddeven_generic.h +++ b/src/oddeven_generic.h @@ -24,39 +24,39 @@ 
struct Thread; - void hopping_term_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operator_PRECISION_struct *op, + void hopping_term_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, operator_PRECISION_struct *op, const int amount, level_struct *l, struct Thread *threading ); void oddeven_setup_PRECISION( operator_double_struct *in, level_struct *l ); void oddeven_free_PRECISION( level_struct *l ); - void oddeven_to_serial_PRECISION( vector_double out, vector_PRECISION in, level_struct *l, struct Thread *threading ); - void serial_to_oddeven_PRECISION( vector_PRECISION out, vector_double in, level_struct *l, struct Thread *threading ); + void oddeven_to_serial_PRECISION( vector_double *out, vector_PRECISION *in, level_struct *l, struct Thread *threading ); + void serial_to_oddeven_PRECISION( vector_PRECISION *out, vector_double *in, level_struct *l, struct Thread *threading ); - void oddeven_to_block_PRECISION( vector_PRECISION out, vector_PRECISION in, level_struct *l, struct Thread *threading ); - void block_to_oddeven_PRECISION( vector_PRECISION out, vector_PRECISION in, level_struct *l, struct Thread *threading ); + void oddeven_to_block_PRECISION( vector_PRECISION *out, vector_PRECISION *in, level_struct *l, struct Thread *threading ); + void block_to_oddeven_PRECISION( vector_PRECISION *out, vector_PRECISION *in, level_struct *l, struct Thread *threading ); - void block_hopping_term_PRECISION( vector_PRECISION eta, vector_PRECISION phi, + void block_hopping_term_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, int start, int amount, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ); - void block_n_hopping_term_PRECISION( vector_PRECISION eta, vector_PRECISION phi, + void block_n_hopping_term_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, int start, int amount, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ); - void block_diag_oo_inv_PRECISION( vector_PRECISION eta, vector_PRECISION phi, + void block_diag_oo_inv_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, int start, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ); - void block_diag_oo_PRECISION( vector_PRECISION eta, vector_PRECISION phi, + void block_diag_oo_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, int start, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ); - void block_diag_ee_PRECISION( vector_PRECISION eta, vector_PRECISION phi, + void block_diag_ee_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, int start, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ); - void apply_schur_complement_PRECISION( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); + void apply_schur_complement_PRECISION( vector_PRECISION *out, vector_PRECISION *in, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); void solve_oddeven_PRECISION( gmres_PRECISION_struct *p, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); - void g5D_apply_schur_complement_PRECISION( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); + void g5D_apply_schur_complement_PRECISION( vector_PRECISION *out, vector_PRECISION *in, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); void g5D_solve_oddeven_PRECISION( gmres_PRECISION_struct *p, operator_PRECISION_struct *op, level_struct *l, struct 
Thread *threading ); void schwarz_PRECISION_oddeven_setup( schwarz_PRECISION_struct *s, level_struct *l ); - void apply_block_schur_complement_PRECISION( vector_PRECISION out, vector_PRECISION in, int start, + void apply_block_schur_complement_PRECISION( vector_PRECISION *out, vector_PRECISION *in, int start, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ); - void block_solve_oddeven_PRECISION( vector_PRECISION phi, vector_PRECISION r, vector_PRECISION latest_iter, + void block_solve_oddeven_PRECISION( vector_PRECISION *phi, vector_PRECISION *r, vector_PRECISION *latest_iter, int start, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ); void oddeven_PRECISION_test( level_struct *l ); diff --git a/src/operator_generic.c b/src/operator_generic.c index f881b23..bbda504 100644 --- a/src/operator_generic.c +++ b/src/operator_generic.c @@ -29,12 +29,8 @@ void operator_PRECISION_init( operator_PRECISION_struct *op ) { op->backward_neighbor_table = NULL; op->translation_table = NULL; op->D = NULL; - op->D_vectorized = NULL; - op->D_transformed_vectorized = NULL; op->clover = NULL; op->clover_oo_inv = NULL; - op->clover_vectorized = NULL; - op->clover_oo_inv_vectorized = NULL; op->m0 = 0; #ifdef HAVE_TM op->mu = 0; @@ -49,8 +45,6 @@ void operator_PRECISION_init( operator_PRECISION_struct *op ) { op->epsbar_ig5_odd_shift = 0; op->epsbar_term = NULL; op->clover_doublet_oo_inv = NULL; - op->clover_doublet_vectorized = NULL; - op->clover_doublet_oo_inv_vectorized = NULL; #endif for ( int mu=0; mu<4; mu++ ) @@ -71,7 +65,7 @@ void operator_PRECISION_alloc_projection_buffers( operator_PRECISION_struct *op, // when used as preconditioner we usually do not need the projection buffers, unless // g.method >= 4: then oddeven_setup_float() is called in init.c, method_setup(). 
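/* Hedged aside (not part of the patch): the hunk below scales the depth-0 projection
 * buffers by g.num_rhs_vect so all right-hand sides fit in one allocation, doubled when
 * the HAVE_TM1p1 doublet is active. A standalone sketch of that arithmetic only; the
 * helper name and the example numbers are assumptions, not taken from the library. */
#include <stdio.h>

static int projection_buffer_length( int num_lattice_site_var, int num_lattice_sites,
                                     int num_rhs_vect, int have_tm1p1 ) {
  int its = ( num_lattice_site_var/2 ) * num_lattice_sites * num_rhs_vect;
  if ( have_tm1p1 )
    its *= 2;   /* doublet fields need twice the storage, as in the hunk below */
  return its;
}

int main( void ) {
  /* hypothetical values: 12 site variables, 4096 local sites, 4 right-hand sides */
  printf( "its = %d\n", projection_buffer_length( 12, 4096, 4, 0 ) );
  return 0;
}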
if ( l->depth == 0 ) { - int its = (l->num_lattice_site_var/2)*l->num_lattice_sites; + int its = (l->num_lattice_site_var/2)*l->num_lattice_sites*g.num_rhs_vect; #ifdef HAVE_TM1p1 its *= 2; #endif @@ -83,7 +77,7 @@ void operator_PRECISION_alloc_projection_buffers( operator_PRECISION_struct *op, void operator_PRECISION_free_projection_buffers( operator_PRECISION_struct *op, level_struct *l ) { if ( l->depth == 0 ) { - int its = (l->num_lattice_site_var/2)*l->num_lattice_sites; + int its = (l->num_lattice_site_var/2)*l->num_lattice_sites*g.num_rhs_vect; #ifdef HAVE_TM1p1 its *= 2; #endif @@ -144,8 +138,6 @@ void operator_PRECISION_alloc( operator_PRECISION_struct *op, const int type, le MALLOC( op->translation_table, int, l->num_inner_lattice_sites ); if ( type == _SCHWARZ && l->depth == 0 && g.odd_even ) { -#ifndef OPTIMIZED_SELF_COUPLING_PRECISION - if( g.csw ) { #ifdef HAVE_TM //we use LU here MALLOC( op->clover_oo_inv, complex_PRECISION, 72*(l->num_inner_lattice_sites/2+1) ); @@ -155,15 +147,6 @@ void operator_PRECISION_alloc( operator_PRECISION_struct *op, const int type, le } #ifdef HAVE_TM1p1 MALLOC( op->clover_doublet_oo_inv, complex_PRECISION, 12*12*2*(l->num_inner_lattice_sites/2+1) ); -#endif - -#else - if( g.csw ) - MALLOC_HUGEPAGES( op->clover_oo_inv_vectorized, PRECISION, 144*(l->num_inner_lattice_sites/2+1), 4*SIMD_LENGTH_PRECISION ); -#ifdef HAVE_TM1p1 - MALLOC_HUGEPAGES( op->clover_doublet_oo_inv_vectorized, PRECISION, 2*2*144*(l->num_inner_lattice_sites/2+1), 4*SIMD_LENGTH_PRECISION ); -#endif - #endif } @@ -224,8 +207,6 @@ void operator_PRECISION_free( operator_PRECISION_struct *op, const int type, lev FREE( op->tm_term, complex_PRECISION, block_site_size*l->num_inner_lattice_sites ); #endif if ( type == _SCHWARZ && l->depth == 0 && g.odd_even ) { -#ifndef OPTIMIZED_SELF_COUPLING_PRECISION - if( g.csw ) { #ifdef HAVE_TM //we use LU here FREE( op->clover_oo_inv, complex_PRECISION, 72*(l->num_inner_lattice_sites/2+1) ); @@ -235,15 +216,6 @@ void operator_PRECISION_free( operator_PRECISION_struct *op, const int type, lev } #ifdef HAVE_TM1p1 FREE( op->clover_doublet_oo_inv, complex_PRECISION, 12*12*2*(l->num_inner_lattice_sites/2+1) ); -#endif - -#else - if( g.csw ) - FREE_HUGEPAGES( op->clover_oo_inv_vectorized, PRECISION, 144*(l->num_inner_lattice_sites/2+1) ); -#ifdef HAVE_TM1p1 - FREE_HUGEPAGES( op->clover_doublet_oo_inv_vectorized, PRECISION, 2*2*144*(l->num_inner_lattice_sites/2+1) ); -#endif - #endif } @@ -338,45 +310,9 @@ void operator_PRECISION_set_couplings( operator_PRECISION_struct *op, level_stru } void operator_PRECISION_set_neighbor_couplings( operator_PRECISION_struct *op, level_struct *l ) { - -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - int i, n = 2*l->num_lattice_sites - l->num_inner_lattice_sites; - - for ( i=0; i<n; i++ ) { - PRECISION *D_vectorized = op->D_vectorized + 96*i; - PRECISION *D_transformed_vectorized = op->D_transformed_vectorized + 96*i; - complex_PRECISION *D_pt = op->D + 36*i; - for ( int mu=0; mu<4; mu++ ) - set_PRECISION_D_vectorized( D_vectorized+24*mu, D_transformed_vectorized+24*mu, D_pt+9*mu ); - } -#endif - } void operator_PRECISION_set_self_couplings( operator_PRECISION_struct *op, level_struct *l ) { - -#ifdef OPTIMIZED_SELF_COUPLING_PRECISION - int i, n = l->num_inner_lattice_sites; - - if ( g.csw != 0 ) - for ( i=0; i<n; i++ ) { - PRECISION *clover_vectorized_pt = op->clover_vectorized + 144*i; - config_PRECISION clover_pt = op->clover + 42*i; - sse_set_clover_PRECISION( clover_vectorized_pt, clover_pt ); -#ifdef HAVE_TM1p1 - PRECISION *clover_doublet_vectorized_pt = op->clover_doublet_vectorized + 288*i; -
sse_set_clover_doublet_PRECISION( clover_doublet_vectorized_pt, clover_pt ); -#endif -#ifdef HAVE_TM - config_PRECISION tm_term_pt = op->tm_term + 12*i; - sse_add_diagonal_clover_PRECISION( clover_vectorized_pt, tm_term_pt ); -#ifdef HAVE_TM1p1 - sse_add_diagonal_clover_doublet_PRECISION( clover_doublet_vectorized_pt, tm_term_pt ); -#endif -#endif - } -#endif - } void operator_PRECISION_test_routine( operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { @@ -390,44 +326,54 @@ void operator_PRECISION_test_routine( operator_PRECISION_struct *op, level_struc * If enabled, also tests odd even preconditioning. *********************************************************************************/ - int ivs = l->inner_vector_size; - double diff; + int ivs = l->inner_vector_size, n_vect=g.num_rhs_vect; + double diff, diff1[n_vect], diff2[n_vect]; - vector_double vd1=NULL, vd2, vd3, vd4; - vector_PRECISION vp1=NULL, vp2; - - PUBLIC_MALLOC( vd1, complex_double, 4*ivs ); - PUBLIC_MALLOC( vp1, complex_PRECISION, 2*ivs ); + vector_double vd[4]; + vector_PRECISION vp[2]; - vd2 = vd1 + ivs; vd3 = vd2 + ivs; vd4 = vd3 + ivs; vp2 = vp1 + ivs; + for(int i=0; i<4; i++){ + vector_double_init( &vd[i] ); + vector_double_alloc( &vd[i], _INNER, n_vect, l, threading ); + } + + for(int i=0; i<2; i++){ + vector_PRECISION_init( &vp[i] ); + vector_PRECISION_alloc( &vp[i], _INNER, n_vect, l, threading ); + } START_LOCKED_MASTER(threading) - vector_double_define_random( vd1, 0, l->inner_vector_size, l ); - apply_operator_double( vd2, vd1, &(g.p), l, no_threading ); + //vector_double_define_random( &vd[0], 0, l->inner_vector_size, l ); + vector_double_define_random_new( &vd[0], l, no_threading ); + apply_operator_double( &vd[1], &vd[0], &(g.p), l, no_threading ); + + trans_PRECISION_new( &vp[0], &vd[0], op->translation_table, l, no_threading ); + apply_operator_PRECISION( &vp[1], &vp[0], &(l->p_PRECISION), l, no_threading ); + trans_back_PRECISION_new( &vd[2], &vp[1], op->translation_table, l, no_threading ); - trans_PRECISION( vp1, vd1, op->translation_table, l, no_threading ); - apply_operator_PRECISION( vp2, vp1, &(l->p_PRECISION), l, no_threading ); - trans_back_PRECISION( vd3, vp2, op->translation_table, l, no_threading ); + //vector_double_minus( &vd[3], &vd[2], &vd[1], 0, l->inner_vector_size, l ); + vector_double_minus_new( &vd[3], &vd[2], &vd[1], l, no_threading ); + //diff = global_norm_double( &vd[3], 0, ivs, l, no_threading )/ + // global_norm_double( &vd[2], 0, ivs, l, no_threading ); + global_norm_double_new( diff1, &vd[3], l, no_threading ); + global_norm_double_new( diff2, &vd[2], l, no_threading ); - vector_double_minus( vd4, vd3, vd2, 0, l->inner_vector_size, l ); - diff = global_norm_double( vd4, 0, ivs, l, no_threading )/ - global_norm_double( vd3, 0, ivs, l, no_threading ); - - test0_PRECISION("depth: %d, correctness of schwarz PRECISION Dirac operator: %le\n", l->depth, diff ); + for(int i=0; i<n_vect; i++) + test0_PRECISION("depth: %d, correctness of schwarz PRECISION Dirac operator: %le\n", l->depth, diff1[i]/diff2[i] ); END_LOCKED_MASTER(threading) if(threading->n_core > 1) { - apply_operator_PRECISION( vp2, vp1, &(l->p_PRECISION), l, threading ); + apply_operator_PRECISION( &vp[1], &vp[0], &(l->p_PRECISION), l, threading ); SYNC_MASTER_TO_ALL(threading) SYNC_CORES(threading) START_LOCKED_MASTER(threading) - trans_back_PRECISION( vd3, vp2, op->translation_table, l, no_threading ); - vector_double_minus( vd4, vd3, vd2, 0, l->inner_vector_size, l ); - diff = global_norm_double( vd4, 0, ivs, l, no_threading ) / - global_norm_double( vd3, 0, ivs, l, no_threading ); +
trans_back_PRECISION( &vd[2], &vp[1], op->translation_table, l, no_threading ); + vector_double_minus( &vd[3], &vd[2], &vd[1], 0, l->inner_vector_size, l ); + diff = global_norm_double( &vd[3], 0, ivs, l, no_threading ) / + global_norm_double( &vd[2], 0, ivs, l, no_threading ); if ( diff > EPS_PRECISION ) printf0("\x1b[31m"); @@ -438,9 +384,14 @@ void operator_PRECISION_test_routine( operator_PRECISION_struct *op, level_struc END_LOCKED_MASTER(threading) } - - PUBLIC_FREE( vd1, complex_double, 4*ivs ); - PUBLIC_FREE( vp1, complex_PRECISION, 2*ivs ); + + for(int i=0; i<4; i++){ + vector_double_free( &vd[i], l, threading ); + } + + for(int i=0; i<2; i++){ + vector_PRECISION_free( &vp[i], l, threading ); + } START_LOCKED_MASTER(threading) if ( g.method >=4 && g.odd_even ) diff --git a/src/preconditioner.c b/src/preconditioner.c index d7065d4..bd2a401 100644 --- a/src/preconditioner.c +++ b/src/preconditioner.c @@ -22,19 +22,19 @@ #include "main.h" #include "preconditioner.h" -void preconditioner( vector_double phi, vector_double Dphi, vector_double eta, +void preconditioner( vector_double *phi, vector_double *Dphi, vector_double *eta, const int res, level_struct *l, struct Thread *threading ) { if ( g.method == 0 ) vector_double_copy( phi, eta, threading->start_index[l->depth], threading->end_index[l->depth], l ); else if ( g.method < 5 || g.method == 6 || !g.odd_even ) { if ( g.mixed_precision ) { - trans_float( l->sbuf_float[0], eta, l->s_float.op.translation_table, l, threading ); - vcycle_float( l->sbuf_float[1], NULL, l->sbuf_float[0], res, l, threading ); - trans_back_float( phi, l->sbuf_float[1], l->s_float.op.translation_table, l, threading ); + trans_float( &(l->sbuf_float[0]), eta, l->s_float.op.translation_table, l, threading ); + vcycle_float( &(l->sbuf_float[1]), NULL, &(l->sbuf_float[0]), res, l, threading ); + trans_back_float( phi, &(l->sbuf_float[1]), l->s_float.op.translation_table, l, threading ); } else { - trans_double( l->sbuf_double[0], eta, l->s_double.op.translation_table, l, threading ); - vcycle_double( l->sbuf_double[1], NULL, l->sbuf_double[0], res, l, threading ); - trans_back_double( phi, l->sbuf_double[1], l->s_double.op.translation_table, l, threading ); + trans_double( &(l->sbuf_double[0]), eta, l->s_double.op.translation_table, l, threading ); + vcycle_double( &(l->sbuf_double[1]), NULL, &(l->sbuf_double[0]), res, l, threading ); + trans_back_double( phi, &(l->sbuf_double[1]), l->s_double.op.translation_table, l, threading ); } } else { if ( g.mixed_precision ) { @@ -42,25 +42,25 @@ void preconditioner( vector_double phi, vector_double Dphi, vector_double eta, l->sp_float.num_restart = l->n_cy; l->sp_float.initial_guess_zero = res; END_LOCKED_MASTER(threading) - serial_to_oddeven_float( l->sp_float.b, eta, l, threading ); + serial_to_oddeven_float( &(l->sp_float.b), eta, l, threading ); if ( g.method == 6 ) { g5D_solve_oddeven_float( &(l->sp_float), &(l->oe_op_float), l, threading ); } else { solve_oddeven_float( &(l->sp_float), &(l->oe_op_float), l, threading ); } - oddeven_to_serial_float( phi, l->sp_float.x, l, threading ); + oddeven_to_serial_float( phi, &(l->sp_float.x), l, threading ); } else { START_LOCKED_MASTER(threading) l->sp_double.num_restart = l->n_cy; l->sp_double.initial_guess_zero = res; END_LOCKED_MASTER(threading) - serial_to_oddeven_double( l->sp_double.b, eta, l, threading ); + serial_to_oddeven_double( &(l->sp_double.b), eta, l, threading ); if ( g.method == 6 ) { g5D_solve_oddeven_double( &(l->sp_double), &(l->oe_op_double), l, 
threading ); } else { solve_oddeven_double( &(l->sp_double), &(l->oe_op_double), l, threading ); } - oddeven_to_serial_double( phi, l->sp_double.x, l, threading ); + oddeven_to_serial_double( phi, &(l->sp_double.x), l, threading ); } } diff --git a/src/preconditioner.h b/src/preconditioner.h index 783c70c..d3f0b02 100644 --- a/src/preconditioner.h +++ b/src/preconditioner.h @@ -29,6 +29,6 @@ #include "schwarz_float.h" #include "schwarz_double.h" - void preconditioner( vector_double phi, vector_double Dphi, vector_double eta, + void preconditioner( vector_double *phi, vector_double *Dphi, vector_double *eta, const int res, level_struct *l, struct Thread *threading ); #endif diff --git a/src/schwarz_generic.c b/src/schwarz_generic.c index 01becd3..ee5dc4b 100644 --- a/src/schwarz_generic.c +++ b/src/schwarz_generic.c @@ -32,7 +32,8 @@ void smoother_PRECISION_def( level_struct *l ) { if ( g.method == 6 ) { l->p_PRECISION.eval_operator = (l->depth > 0)?g5D_apply_coarse_operator_PRECISION:g5D_plus_clover_PRECISION; } else { - l->p_PRECISION.eval_operator = (l->depth > 0)?apply_coarse_operator_PRECISION:d_plus_clover_PRECISION; + //l->p_PRECISION.eval_operator = (l->depth > 0)?apply_coarse_operator_PRECISION:d_plus_clover_PRECISION; + l->p_PRECISION.eval_operator = d_plus_clover_PRECISION_new; } } @@ -51,16 +52,16 @@ void schwarz_PRECISION_init( schwarz_PRECISION_struct *s, level_struct *l ) { s->index[T] = NULL; s->oe_index[T] = NULL; s->block = NULL; - s->buf1 = NULL; - s->buf2 = NULL; - s->buf3 = NULL; - s->buf4 = NULL; - s->buf5 = NULL; - l->sbuf_PRECISION[0] = NULL; - s->oe_buf[0] = NULL; - s->oe_buf[1] = NULL; - s->oe_buf[2] = NULL; - s->oe_buf[3] = NULL; + vector_PRECISION_init(&(s->buf1)); + vector_PRECISION_init(&(s->buf2)); + vector_PRECISION_init(&(s->buf3)); + vector_PRECISION_init(&(s->buf4)); + vector_PRECISION_init(&(s->buf5)); + vector_PRECISION_init(&(l->sbuf_PRECISION[0])); + vector_PRECISION_init(&(s->oe_buf[0])); + vector_PRECISION_init(&(s->oe_buf[1])); + vector_PRECISION_init(&(s->oe_buf[2])); + vector_PRECISION_init(&(s->oe_buf[3])); s->local_minres_buffer[0] = NULL; s->local_minres_buffer[1] = NULL; s->local_minres_buffer[2] = NULL; @@ -69,25 +70,24 @@ void schwarz_PRECISION_init( schwarz_PRECISION_struct *s, level_struct *l ) { s->num_colors = 0; } - void schwarz_PRECISION_alloc( schwarz_PRECISION_struct *s, level_struct *l ) { int i, j, n, mu, nu, *bl = l->block_lattice; if ( g.method == 4 ) { - fgmres_PRECISION_struct_alloc( l->block_iter, 1, (l->depth==0)?l->inner_vector_size:l->vector_size, + fgmres_PRECISION_struct_alloc( l->block_iter, 1, (l->depth==0)?_INNER:_ORDINARY, EPS_PRECISION, _COARSE_GMRES, _NOTHING, NULL, (l->depth==0)?(g.odd_even?apply_schur_complement_PRECISION:d_plus_clover_PRECISION): (g.odd_even?coarse_apply_schur_complement_PRECISION:apply_coarse_operator_PRECISION), &(l->sp_PRECISION), l ); } else if ( g.method == 5 ) { - fgmres_PRECISION_struct_alloc( 5, 1, (l->depth==0)?l->inner_vector_size:l->vector_size, + fgmres_PRECISION_struct_alloc( 5, 1, (l->depth==0)?_INNER:_ORDINARY, EPS_PRECISION, _COARSE_GMRES, _NOTHING, NULL, (l->depth==0)?(g.odd_even?apply_schur_complement_PRECISION:d_plus_clover_PRECISION): (g.odd_even?coarse_apply_schur_complement_PRECISION:apply_coarse_operator_PRECISION), &(l->sp_PRECISION), l ); } else if ( g.method == 6 ) { - fgmres_PRECISION_struct_alloc( l->block_iter, 1, (l->depth==0)?l->inner_vector_size:l->vector_size, + fgmres_PRECISION_struct_alloc( l->block_iter, 1, (l->depth==0)?_INNER:_ORDINARY, EPS_PRECISION, 
_COARSE_GMRES, _NOTHING, NULL, (l->depth==0)?(g.odd_even?g5D_apply_schur_complement_PRECISION:g5D_plus_clover_PRECISION): (g.odd_even?g5D_coarse_apply_schur_complement_PRECISION:g5D_apply_coarse_operator_PRECISION), @@ -140,18 +140,19 @@ void schwarz_PRECISION_alloc( schwarz_PRECISION_struct *s, level_struct *l ) { MALLOC( s->block, block_struct, s->num_blocks ); - int svs = l->schwarz_vector_size, vs = (l->depth==0)?l->inner_vector_size:l->vector_size; + int svs = l->schwarz_vector_size; + int nvec = 1; #ifdef HAVE_TM1p1 svs *= 2; - vs *= 2; + nvec = 2; #endif if ( l->depth == 0 ) { - MALLOC( s->oe_buf[0], complex_PRECISION, 4*vs ); - s->oe_buf[1] = s->oe_buf[0] + vs; - s->oe_buf[2] = s->oe_buf[1] + vs; - s->oe_buf[3] = s->oe_buf[2] + vs; + for ( i=0; i<4; i++ ) { + vector_PRECISION_init( &(s->oe_buf[i]) ); + vector_PRECISION_alloc( &(s->oe_buf[i]), _INNER, nvec, l, no_threading ); + } } n = 0; @@ -172,37 +173,31 @@ void schwarz_PRECISION_alloc( schwarz_PRECISION_struct *s, level_struct *l ) { s->block[i].bt = NULL; MALLOC( s->block[i].bt, int, n ); } - - MALLOC( s->buf1, complex_PRECISION, vs+3*svs ); - s->buf2 = s->buf1 + vs; - s->buf3 = s->buf2 + svs; - s->buf4 = s->buf3 + svs; + vector_PRECISION_init( &(s->buf1) ); + vector_PRECISION_init( &(s->buf2) ); + vector_PRECISION_init( &(s->buf3) ); + vector_PRECISION_init( &(s->buf4) ); + + vector_PRECISION_alloc( &(s->buf1), (l->depth==0)?_INNER:_ORDINARY, nvec, l, no_threading ); + vector_PRECISION_alloc( &(s->buf2), _SCHWARZ, nvec, l, no_threading ); + vector_PRECISION_alloc( &(s->buf3), _SCHWARZ, nvec, l, no_threading ); + vector_PRECISION_alloc( &(s->buf4), _SCHWARZ, nvec, l, no_threading ); - if ( g.method == 1 ) - MALLOC( s->buf5, complex_PRECISION, svs ); - - MALLOC( l->sbuf_PRECISION[0], complex_PRECISION, 2*vs ); - l->sbuf_PRECISION[1] = l->sbuf_PRECISION[0] + vs; + if ( g.method == 1 ){ + vector_PRECISION_init( &(s->buf5) ); + vector_PRECISION_alloc( &(s->buf5), _SCHWARZ, nvec, l, no_threading ); + } + + for ( i=0; i<2; i++ ) { + vector_PRECISION_init( &(l->sbuf_PRECISION[i]) ); + vector_PRECISION_alloc( &(l->sbuf_PRECISION[i]), (l->depth==0)?_INNER:_ORDINARY, nvec, l, no_threading ); + } // these buffers are introduced to make local_minres_PRECISION thread-safe MALLOC( s->local_minres_buffer[0], complex_PRECISION, svs ); MALLOC( s->local_minres_buffer[1], complex_PRECISION, svs ); MALLOC( s->local_minres_buffer[2], complex_PRECISION, svs ); -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_float - if ( l->depth == 0 ) { - MALLOC_HUGEPAGES( s->op.D_vectorized, PRECISION, 2*4*(2*l->vector_size-l->inner_vector_size), 4*SIMD_LENGTH_PRECISION ); - MALLOC_HUGEPAGES( s->op.D_transformed_vectorized, PRECISION, 2*4*(2*l->vector_size-l->inner_vector_size), 4*SIMD_LENGTH_PRECISION ); - } -#endif -#ifdef OPTIMIZED_SELF_COUPLING_PRECISION - if ( l->depth == 0 ) { - MALLOC_HUGEPAGES( s->op.clover_vectorized, PRECISION, 2*6*l->inner_vector_size, 4*SIMD_LENGTH_PRECISION ); -#ifdef HAVE_TM1p1 - MALLOC_HUGEPAGES( s->op.clover_doublet_vectorized, PRECISION, 4*2*6*l->inner_vector_size, 4*SIMD_LENGTH_PRECISION ); -#endif - } -#endif } @@ -252,32 +247,27 @@ void schwarz_PRECISION_free( schwarz_PRECISION_struct *s, level_struct *l ) { FREE( s->block, block_struct, s->num_blocks ); - int svs = l->schwarz_vector_size, vs = (l->depth==0)?l->inner_vector_size:l->vector_size; + int svs = l->schwarz_vector_size; #ifdef HAVE_TM1p1 svs *= 2; - vs *= 2; #endif - if ( l->depth == 0 ) { - s->oe_buf[1] = NULL; - s->oe_buf[2] = NULL; - s->oe_buf[3] = NULL; - FREE( s->oe_buf[0], 
complex_PRECISION, 4*vs ); - s->oe_buf[0] = NULL; - } + if ( l->depth == 0 ) + for ( i=0; i<4; i++ ) + vector_PRECISION_free( &(s->oe_buf[i]), l, no_threading ); - - FREE( s->buf1, complex_PRECISION, vs+3*svs ); - s->buf2 = NULL; s->buf3 = NULL; - s->buf4 = NULL; + vector_PRECISION_free( &(s->buf1), l, no_threading ); + vector_PRECISION_free( &(s->buf2), l, no_threading ); + vector_PRECISION_free( &(s->buf3), l, no_threading ); + vector_PRECISION_free( &(s->buf4), l, no_threading ); if ( g.method == 1 ) - FREE( s->buf5, complex_PRECISION, svs ); + vector_PRECISION_free( &(s->buf5), l, no_threading ); operator_PRECISION_free( &(s->op), _SCHWARZ, l ); - FREE( l->sbuf_PRECISION[0], complex_PRECISION, 2*vs ); - l->sbuf_PRECISION[1] = NULL; + for ( i=0; i<2; i++ ) + vector_PRECISION_free( &(l->sbuf_PRECISION[i]), l, no_threading ); FREE( s->local_minres_buffer[0], complex_PRECISION, svs ); FREE( s->local_minres_buffer[1], complex_PRECISION, svs ); @@ -286,20 +276,6 @@ void schwarz_PRECISION_free( schwarz_PRECISION_struct *s, level_struct *l ) { s->local_minres_buffer[1] = NULL; s->local_minres_buffer[2] = NULL; -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_float - if ( l->depth == 0 ) { - FREE_HUGEPAGES( s->op.D_vectorized, PRECISION, 2*4*(2*l->vector_size-l->inner_vector_size) ); - FREE_HUGEPAGES( s->op.D_transformed_vectorized, PRECISION, 2*4*(2*l->vector_size-l->inner_vector_size) ); - } -#endif -#ifdef OPTIMIZED_SELF_COUPLING_PRECISION - if ( l->depth == 0 ) { - FREE_HUGEPAGES( s->op.clover_vectorized, PRECISION, 2*6*l->inner_vector_size ); -#ifdef HAVE_TM1p1 - FREE_HUGEPAGES( s->op.clover_doublet_vectorized, PRECISION, 4*2*6*l->inner_vector_size ); -#endif - } -#endif } @@ -649,7 +625,7 @@ void schwarz_PRECISION_boundary_update( schwarz_PRECISION_struct *s, level_struc int i, t, z, y, x, mu, nu, index, *it = s->op.index_table, *dt = s->op.table_dim, ls[4], le[4], buf_length[4], link_size; - vector_PRECISION buf[4] = {NULL,NULL,NULL,NULL}, rbuf[4] = {NULL,NULL,NULL,NULL}; + buffer_PRECISION buf[4] = {NULL,NULL,NULL,NULL}, rbuf[4] = {NULL,NULL,NULL,NULL}; config_PRECISION D=s->op.D; for ( mu=0; mu<4; mu++ ) { @@ -694,7 +670,7 @@ void schwarz_PRECISION_boundary_update( schwarz_PRECISION_struct *s, level_struc for ( y=ls[Y]; yneighbor_rank[2*mu], @@ -715,7 +691,7 @@ void schwarz_PRECISION_boundary_update( schwarz_PRECISION_struct *s, level_struc for ( y=ls[Y]; yblock_boundary_length; -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_float - PRECISION *Dplus = s->op.D_vectorized; - PRECISION *Dminus = s->op.D_transformed_vectorized; - - for ( int mu=0; mu<4; mu++ ) { - boundary_plus_coupling_PRECISION( (PRECISION*)eta, Dplus, (PRECISION*)phi, - mu, bbl[2*mu], bbl[2*mu+1], s->block[k].bt, NULL ); - boundary_minus_coupling_PRECISION( (PRECISION*)eta, Dminus, (PRECISION*)phi, - mu, bbl[2*mu+1], bbl[2*mu+2], s->block[k].bt, NULL ); - } -#else int i, mu, index, neighbor_index; config_PRECISION D_pt, D = s->op.D; - vector_PRECISION phi_pt, eta_pt; + buffer_PRECISION phi_pt, eta_pt; #ifdef HAVE_TM1p1 if( g.n_flavours == 2 ) { @@ -762,8 +727,8 @@ void block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, in index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*index + 9*mu; - phi_pt = phi + 24*neighbor_index; - eta_pt = eta + 24*index; + phi_pt = phi->vector_buffer + 24*neighbor_index; + eta_pt = eta->vector_buffer + 24*index; dprp_T_PRECISION( buf1, phi_pt ); mvm_PRECISION( buf2, D_pt, buf1 ); mvm_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -776,8 +741,8 @@ void 
block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, in index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*neighbor_index + 9*mu; - phi_pt = phi + 24*neighbor_index; - eta_pt = eta + 24*index; + phi_pt = phi->vector_buffer + 24*neighbor_index; + eta_pt = eta->vector_buffer + 24*index; dprn_T_PRECISION( buf1, phi_pt ); mvmh_PRECISION( buf2, D_pt, buf1 ); mvmh_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -792,8 +757,8 @@ void block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, in index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*index + 9*mu; - phi_pt = phi + 24*neighbor_index; - eta_pt = eta + 24*index; + phi_pt = phi->vector_buffer + 24*neighbor_index; + eta_pt = eta->vector_buffer + 24*index; dprp_Z_PRECISION( buf1, phi_pt ); mvm_PRECISION( buf2, D_pt, buf1 ); mvm_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -806,8 +771,8 @@ void block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, in index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*neighbor_index + 9*mu; - phi_pt = phi + 24*neighbor_index; - eta_pt = eta + 24*index; + phi_pt = phi->vector_buffer + 24*neighbor_index; + eta_pt = eta->vector_buffer + 24*index; dprn_Z_PRECISION( buf1, phi_pt ); mvmh_PRECISION( buf2, D_pt, buf1 ); mvmh_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -822,8 +787,8 @@ void block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, in index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*index + 9*mu; - phi_pt = phi + 24*neighbor_index; - eta_pt = eta + 24*index; + phi_pt = phi->vector_buffer + 24*neighbor_index; + eta_pt = eta->vector_buffer + 24*index; dprp_Y_PRECISION( buf1, phi_pt ); mvm_PRECISION( buf2, D_pt, buf1 ); mvm_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -836,8 +801,8 @@ void block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, in index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*neighbor_index + 9*mu; - phi_pt = phi + 24*neighbor_index; - eta_pt = eta + 24*index; + phi_pt = phi->vector_buffer + 24*neighbor_index; + eta_pt = eta->vector_buffer + 24*index; dprn_Y_PRECISION( buf1, phi_pt ); mvmh_PRECISION( buf2, D_pt, buf1 ); mvmh_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -852,8 +817,8 @@ void block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, in index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*index + 9*mu; - phi_pt = phi + 24*neighbor_index; - eta_pt = eta + 24*index; + phi_pt = phi->vector_buffer + 24*neighbor_index; + eta_pt = eta->vector_buffer + 24*index; dprp_X_PRECISION( buf1, phi_pt ); mvm_PRECISION( buf2, D_pt, buf1 ); mvm_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -866,8 +831,8 @@ void block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, in index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*neighbor_index + 9*mu; - phi_pt = phi + 24*neighbor_index; - eta_pt = eta + 24*index; + phi_pt = phi->vector_buffer + 24*neighbor_index; + eta_pt = eta->vector_buffer + 24*index; dprn_X_PRECISION( buf1, phi_pt ); mvmh_PRECISION( buf2, D_pt, buf1 ); mvmh_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -884,8 +849,8 @@ void block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, in index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*index + 9*mu; - phi_pt = phi + 12*neighbor_index; - eta_pt = eta + 12*index; + phi_pt = phi->vector_buffer + 12*neighbor_index; + eta_pt = 
eta->vector_buffer + 12*index; prp_T_PRECISION( buf1, phi_pt ); mvm_PRECISION( buf2, D_pt, buf1 ); mvm_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -896,8 +861,8 @@ void block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, in index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*neighbor_index + 9*mu; - phi_pt = phi + 12*neighbor_index; - eta_pt = eta + 12*index; + phi_pt = phi->vector_buffer + 12*neighbor_index; + eta_pt = eta->vector_buffer + 12*index; prn_T_PRECISION( buf1, phi_pt ); mvmh_PRECISION( buf2, D_pt, buf1 ); mvmh_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -910,8 +875,8 @@ void block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, in index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*index + 9*mu; - phi_pt = phi + 12*neighbor_index; - eta_pt = eta + 12*index; + phi_pt = phi->vector_buffer + 12*neighbor_index; + eta_pt = eta->vector_buffer + 12*index; prp_Z_PRECISION( buf1, phi_pt ); mvm_PRECISION( buf2, D_pt, buf1 ); mvm_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -922,8 +887,8 @@ void block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, in index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*neighbor_index + 9*mu; - phi_pt = phi + 12*neighbor_index; - eta_pt = eta + 12*index; + phi_pt = phi->vector_buffer + 12*neighbor_index; + eta_pt = eta->vector_buffer + 12*index; prn_Z_PRECISION( buf1, phi_pt ); mvmh_PRECISION( buf2, D_pt, buf1 ); mvmh_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -936,8 +901,8 @@ void block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, in index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*index + 9*mu; - phi_pt = phi + 12*neighbor_index; - eta_pt = eta + 12*index; + phi_pt = phi->vector_buffer + 12*neighbor_index; + eta_pt = eta->vector_buffer + 12*index; prp_Y_PRECISION( buf1, phi_pt ); mvm_PRECISION( buf2, D_pt, buf1 ); mvm_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -948,8 +913,8 @@ void block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, in index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*neighbor_index + 9*mu; - phi_pt = phi + 12*neighbor_index; - eta_pt = eta + 12*index; + phi_pt = phi->vector_buffer + 12*neighbor_index; + eta_pt = eta->vector_buffer + 12*index; prn_Y_PRECISION( buf1, phi_pt ); mvmh_PRECISION( buf2, D_pt, buf1 ); mvmh_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -962,8 +927,8 @@ void block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, in index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*index + 9*mu; - phi_pt = phi + 12*neighbor_index; - eta_pt = eta + 12*index; + phi_pt = phi->vector_buffer + 12*neighbor_index; + eta_pt = eta->vector_buffer + 12*index; prp_X_PRECISION( buf1, phi_pt ); mvm_PRECISION( buf2, D_pt, buf1 ); mvm_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -974,8 +939,8 @@ void block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, in index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*neighbor_index + 9*mu; - phi_pt = phi + 12*neighbor_index; - eta_pt = eta + 12*index; + phi_pt = phi->vector_buffer + 12*neighbor_index; + eta_pt = eta->vector_buffer + 12*index; prn_X_PRECISION( buf1, phi_pt ); mvmh_PRECISION( buf2, D_pt, buf1 ); mvmh_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -984,28 +949,16 @@ void block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, in #ifdef HAVE_TM1p1 } #endif -#endif } -void 
n_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, int k, +void n_block_PRECISION_boundary_op( vector_PRECISION *eta, vector_PRECISION *phi, int k, schwarz_PRECISION_struct *s, level_struct *l ) { // k: number of current block int *bbl = s->block_boundary_length; -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_float - PRECISION *Dplus = s->op.D_vectorized; - PRECISION *Dminus = s->op.D_transformed_vectorized; - - for ( int mu=0; mu<4; mu++ ) { - boundary_nplus_coupling_PRECISION( (PRECISION*)eta, Dplus, (PRECISION*)phi, - mu, bbl[2*mu], bbl[2*mu+1], s->block[k].bt, NULL ); - boundary_nminus_coupling_PRECISION( (PRECISION*)eta, Dminus, (PRECISION*)phi, - mu, bbl[2*mu+1], bbl[2*mu+2], s->block[k].bt, NULL ); - } -#else int i, mu, index, neighbor_index; config_PRECISION D_pt, D = s->op.D; - vector_PRECISION phi_pt, eta_pt; + buffer_PRECISION phi_pt, eta_pt; #ifdef HAVE_TM1p1 if( g.n_flavours == 2 ) { @@ -1016,8 +969,8 @@ void n_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*index + 9*mu; - phi_pt = phi + 24*neighbor_index; - eta_pt = eta + 24*index; + phi_pt = phi->vector_buffer + 24*neighbor_index; + eta_pt = eta->vector_buffer + 24*index; dprp_T_PRECISION( buf1, phi_pt ); nmvm_PRECISION( buf2, D_pt, buf1 ); nmvm_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -1030,8 +983,8 @@ void n_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*neighbor_index + 9*mu; - phi_pt = phi + 24*neighbor_index; - eta_pt = eta + 24*index; + phi_pt = phi->vector_buffer + 24*neighbor_index; + eta_pt = eta->vector_buffer + 24*index; dprn_T_PRECISION( buf1, phi_pt ); nmvmh_PRECISION( buf2, D_pt, buf1 ); nmvmh_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -1046,8 +999,8 @@ void n_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*index + 9*mu; - phi_pt = phi + 24*neighbor_index; - eta_pt = eta + 24*index; + phi_pt = phi->vector_buffer + 24*neighbor_index; + eta_pt = eta->vector_buffer + 24*index; dprp_Z_PRECISION( buf1, phi_pt ); nmvm_PRECISION( buf2, D_pt, buf1 ); nmvm_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -1060,8 +1013,8 @@ void n_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*neighbor_index + 9*mu; - phi_pt = phi + 24*neighbor_index; - eta_pt = eta + 24*index; + phi_pt = phi->vector_buffer + 24*neighbor_index; + eta_pt = eta->vector_buffer + 24*index; dprn_Z_PRECISION( buf1, phi_pt ); nmvmh_PRECISION( buf2, D_pt, buf1 ); nmvmh_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -1076,8 +1029,8 @@ void n_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*index + 9*mu; - phi_pt = phi + 24*neighbor_index; - eta_pt = eta + 24*index; + phi_pt = phi->vector_buffer + 24*neighbor_index; + eta_pt = eta->vector_buffer + 24*index; dprp_Y_PRECISION( buf1, phi_pt ); nmvm_PRECISION( buf2, D_pt, buf1 ); nmvm_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -1090,8 +1043,8 @@ void n_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*neighbor_index + 9*mu; - phi_pt = phi + 24*neighbor_index; - eta_pt = eta + 24*index; + phi_pt = phi->vector_buffer + 
24*neighbor_index; + eta_pt = eta->vector_buffer + 24*index; dprn_Y_PRECISION( buf1, phi_pt ); nmvmh_PRECISION( buf2, D_pt, buf1 ); nmvmh_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -1106,8 +1059,8 @@ void n_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*index + 9*mu; - phi_pt = phi + 24*neighbor_index; - eta_pt = eta + 24*index; + phi_pt = phi->vector_buffer + 24*neighbor_index; + eta_pt = eta->vector_buffer + 24*index; dprp_X_PRECISION( buf1, phi_pt ); nmvm_PRECISION( buf2, D_pt, buf1 ); nmvm_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -1120,8 +1073,8 @@ void n_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*neighbor_index + 9*mu; - phi_pt = phi + 24*neighbor_index; - eta_pt = eta + 24*index; + phi_pt = phi->vector_buffer + 24*neighbor_index; + eta_pt = eta->vector_buffer + 24*index; dprn_X_PRECISION( buf1, phi_pt ); nmvmh_PRECISION( buf2, D_pt, buf1 ); nmvmh_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -1138,8 +1091,8 @@ void n_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*index + 9*mu; - phi_pt = phi + 12*neighbor_index; - eta_pt = eta + 12*index; + phi_pt = phi->vector_buffer + 12*neighbor_index; + eta_pt = eta->vector_buffer + 12*index; prp_T_PRECISION( buf1, phi_pt ); nmvm_PRECISION( buf2, D_pt, buf1 ); nmvm_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -1150,8 +1103,8 @@ void n_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*neighbor_index + 9*mu; - phi_pt = phi + 12*neighbor_index; - eta_pt = eta + 12*index; + phi_pt = phi->vector_buffer + 12*neighbor_index; + eta_pt = eta->vector_buffer + 12*index; prn_T_PRECISION( buf1, phi_pt ); nmvmh_PRECISION( buf2, D_pt, buf1 ); nmvmh_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -1164,8 +1117,8 @@ void n_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*index + 9*mu; - phi_pt = phi + 12*neighbor_index; - eta_pt = eta + 12*index; + phi_pt = phi->vector_buffer + 12*neighbor_index; + eta_pt = eta->vector_buffer + 12*index; prp_Z_PRECISION( buf1, phi_pt ); nmvm_PRECISION( buf2, D_pt, buf1 ); nmvm_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -1176,8 +1129,8 @@ void n_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*neighbor_index + 9*mu; - phi_pt = phi + 12*neighbor_index; - eta_pt = eta + 12*index; + phi_pt = phi->vector_buffer + 12*neighbor_index; + eta_pt = eta->vector_buffer + 12*index; prn_Z_PRECISION( buf1, phi_pt ); nmvmh_PRECISION( buf2, D_pt, buf1 ); nmvmh_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -1190,8 +1143,8 @@ void n_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*index + 9*mu; - phi_pt = phi + 12*neighbor_index; - eta_pt = eta + 12*index; + phi_pt = phi->vector_buffer + 12*neighbor_index; + eta_pt = eta->vector_buffer + 12*index; prp_Y_PRECISION( buf1, phi_pt ); nmvm_PRECISION( buf2, D_pt, buf1 ); nmvm_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -1202,8 +1155,8 @@ void n_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, index = 
s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*neighbor_index + 9*mu; - phi_pt = phi + 12*neighbor_index; - eta_pt = eta + 12*index; + phi_pt = phi->vector_buffer + 12*neighbor_index; + eta_pt = eta->vector_buffer + 12*index; prn_Y_PRECISION( buf1, phi_pt ); nmvmh_PRECISION( buf2, D_pt, buf1 ); nmvmh_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -1216,8 +1169,8 @@ void n_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*index + 9*mu; - phi_pt = phi + 12*neighbor_index; - eta_pt = eta + 12*index; + phi_pt = phi->vector_buffer + 12*neighbor_index; + eta_pt = eta->vector_buffer + 12*index; prp_X_PRECISION( buf1, phi_pt ); nmvm_PRECISION( buf2, D_pt, buf1 ); nmvm_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -1228,8 +1181,8 @@ void n_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*neighbor_index + 9*mu; - phi_pt = phi + 12*neighbor_index; - eta_pt = eta + 12*index; + phi_pt = phi->vector_buffer + 12*neighbor_index; + eta_pt = eta->vector_buffer + 12*index; prn_X_PRECISION( buf1, phi_pt ); nmvmh_PRECISION( buf2, D_pt, buf1 ); nmvmh_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -1237,41 +1190,15 @@ void n_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, } #ifdef HAVE_TM1p1 } -#endif #endif } -void coarse_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, +void coarse_block_PRECISION_boundary_op( vector_PRECISION *eta, vector_PRECISION *phi, int k, schwarz_PRECISION_struct *s, level_struct *l ) { // k: number of current block int *bbl = s->block_boundary_length, n = l->num_lattice_site_var; -#ifdef OPTIMIZED_COARSE_NEIGHBOR_COUPLING_PRECISION - int column_offset = 2*SIMD_LENGTH_PRECISION*((l->num_parent_eig_vect+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - int vectorized_link_offset = 4*l->num_parent_eig_vect*column_offset; - - for ( int mu=0; mu<4; mu++ ) { - OPERATOR_TYPE_PRECISION *Dplus = s->op.D_vectorized + mu*vectorized_link_offset; - OPERATOR_TYPE_PRECISION *Dminus = s->op.D_transformed_vectorized + mu*vectorized_link_offset; - // plus mu direction - for ( int i=bbl[2*mu]; iblock[k].bt[i]; - int neighbor_index = s->block[k].bt[i+1]; - vector_PRECISION phi_pt = phi + n*neighbor_index; - vector_PRECISION eta_pt = eta + n*index; - coarse_hopp_PRECISION_vectorized( eta_pt, phi_pt, Dplus + 4*vectorized_link_offset*index, l ); - } - // minus mu direction - for ( int i=bbl[2*mu+1]; iblock[k].bt[i]; - int neighbor_index = s->block[k].bt[i+1]; - vector_PRECISION phi_pt = phi + n*neighbor_index; - vector_PRECISION eta_pt = eta + n*index; - coarse_hopp_PRECISION_vectorized( eta_pt, phi_pt, Dminus + 4*vectorized_link_offset*neighbor_index, l ); - } - } -#else config_PRECISION D = s->op.D; int link_size = SQUARE(2*l->num_parent_eig_vect), site_size=4*link_size; @@ -1280,54 +1207,30 @@ void coarse_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION for ( int i=bbl[2*mu]; iblock[k].bt[i]; int neighbor_index = s->block[k].bt[i+1]; - vector_PRECISION phi_pt = phi + n*neighbor_index; - vector_PRECISION eta_pt = eta + n*index; + vector_PRECISION phi_pt, eta_pt; + phi_pt.vector_buffer = phi->vector_buffer + n*neighbor_index; + eta_pt.vector_buffer = eta->vector_buffer + n*index; config_PRECISION D_pt = D + site_size*index + link_size*mu; - coarse_hopp_PRECISION( eta_pt, phi_pt, D_pt, l ); + coarse_hopp_PRECISION( &eta_pt, &phi_pt, D_pt, l 
); } // minus mu direction for ( int i=bbl[2*mu+1]; iblock[k].bt[i]; int neighbor_index = s->block[k].bt[i+1]; - vector_PRECISION phi_pt = phi + n*neighbor_index; - vector_PRECISION eta_pt = eta + n*index; + vector_PRECISION phi_pt, eta_pt; + phi_pt.vector_buffer = phi->vector_buffer + n*neighbor_index; + eta_pt.vector_buffer = eta->vector_buffer + n*index; config_PRECISION D_pt = D + site_size*neighbor_index + link_size*mu; - coarse_daggered_hopp_PRECISION( eta_pt, phi_pt, D_pt, l ); + coarse_daggered_hopp_PRECISION( &eta_pt, &phi_pt, D_pt, l ); } } -#endif } -void n_coarse_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, +void n_coarse_block_PRECISION_boundary_op( vector_PRECISION *eta, vector_PRECISION *phi, int k, schwarz_PRECISION_struct *s, level_struct *l ) { // k: number of current block int *bbl = s->block_boundary_length, n = l->num_lattice_site_var; -#ifdef OPTIMIZED_COARSE_NEIGHBOR_COUPLING_PRECISION - int column_offset = 2*SIMD_LENGTH_PRECISION*((l->num_parent_eig_vect+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - int vectorized_link_offset = 4*l->num_parent_eig_vect*column_offset; - - for ( int mu=0; mu<4; mu++ ) { - OPERATOR_TYPE_PRECISION *Dplus = s->op.D_vectorized + mu*vectorized_link_offset; - OPERATOR_TYPE_PRECISION *Dminus = s->op.D_transformed_vectorized + mu*vectorized_link_offset; - // plus mu direction - for ( int i=bbl[2*mu]; iblock[k].bt[i]; - int neighbor_index = s->block[k].bt[i+1]; - vector_PRECISION phi_pt = phi + n*neighbor_index; - vector_PRECISION eta_pt = eta + n*index; - coarse_n_hopp_PRECISION_vectorized( eta_pt, phi_pt, Dplus + 4*vectorized_link_offset*index, l ); - } - // minus mu direction - for ( int i=bbl[2*mu+1]; iblock[k].bt[i]; - int neighbor_index = s->block[k].bt[i+1]; - vector_PRECISION phi_pt = phi + n*neighbor_index; - vector_PRECISION eta_pt = eta + n*index; - coarse_n_hopp_PRECISION_vectorized( eta_pt, phi_pt, Dminus + 4*vectorized_link_offset*neighbor_index, l ); - } - } -#else int link_size = SQUARE(2*l->num_parent_eig_vect), site_size=4*link_size; config_PRECISION D = s->op.D; @@ -1336,22 +1239,23 @@ void n_coarse_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISIO for ( int i=bbl[2*mu]; iblock[k].bt[i]; int neighbor_index = s->block[k].bt[i+1]; - vector_PRECISION phi_pt = phi + n*neighbor_index; - vector_PRECISION eta_pt = eta + n*index; + vector_PRECISION phi_pt, eta_pt; + phi_pt.vector_buffer = phi->vector_buffer + n*neighbor_index; + eta_pt.vector_buffer = eta->vector_buffer + n*index; config_PRECISION D_pt = D + site_size*index + link_size*mu; - coarse_n_hopp_PRECISION( eta_pt, phi_pt, D_pt, l ); + coarse_n_hopp_PRECISION( &eta_pt, &phi_pt, D_pt, l ); } // minus mu direction for ( int i=bbl[2*mu+1]; iblock[k].bt[i]; int neighbor_index = s->block[k].bt[i+1]; - vector_PRECISION phi_pt = phi + n*neighbor_index; - vector_PRECISION eta_pt = eta + n*index; + vector_PRECISION phi_pt, eta_pt; + phi_pt.vector_buffer = phi->vector_buffer + n*neighbor_index; + eta_pt.vector_buffer = eta->vector_buffer + n*index; config_PRECISION D_pt = D + site_size*neighbor_index + link_size*mu; - coarse_n_daggered_hopp_PRECISION( eta_pt, phi_pt, D_pt, l ); + coarse_n_daggered_hopp_PRECISION( &eta_pt, &phi_pt, D_pt, l ); } } -#endif } @@ -1412,18 +1316,19 @@ void schwarz_PRECISION_setup( schwarz_PRECISION_struct *s, operator_double_struc } -void additive_schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_phi, vector_PRECISION eta, const int cycles, int res, +void additive_schwarz_PRECISION( 
vector_PRECISION *phi, vector_PRECISION *D_phi, vector_PRECISION *eta, const int cycles, int res, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ) { START_NO_HYPERTHREADS(threading) int k, mu, i, nb = s->num_blocks; - vector_PRECISION r = s->buf1, Dphi = s->buf4, latest_iter = s->buf2, x = s->buf3, latest_iter2 = s->buf5, swap = NULL; + vector_PRECISION *r = &(s->buf1), *Dphi = &(s->buf4), *latest_iter = &(s->buf2), *x = &(s->buf3), *latest_iter2 = &(s->buf5), *swap = NULL; void (*block_op)() = (l->depth==0)?block_d_plus_clover_PRECISION:coarse_block_operator_PRECISION; void (*boundary_op)() = (l->depth==0)?block_PRECISION_boundary_op:coarse_block_PRECISION_boundary_op, (*n_boundary_op)() = (l->depth==0)?n_block_PRECISION_boundary_op:n_coarse_block_PRECISION_boundary_op, - (* block_solve)() = (l->depth==0&&g.odd_even)?block_solve_oddeven_PRECISION:local_minres_PRECISION; + (*block_solve)() = (l->depth==0&&g.odd_even)?block_solve_oddeven_PRECISION:local_minres_PRECISION; + //vector_PRECISION_init(swap); int nb_thread_start; int nb_thread_end; @@ -1594,16 +1499,13 @@ void additive_schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_phi, v } -void red_black_schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_phi, vector_PRECISION eta, const int cycles, int res, +void red_black_schwarz_PRECISION( vector_PRECISION *phi, vector_PRECISION *D_phi, vector_PRECISION *eta, const int cycles, int res, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ) { START_NO_HYPERTHREADS(threading) int k=0, mu, i, init_res = res, res_comm = res, step; - vector_PRECISION r = s->buf1; - vector_PRECISION Dphi = s->buf4; - vector_PRECISION latest_iter = s->buf2; - vector_PRECISION x = s->buf3; + vector_PRECISION *r = &(s->buf1), *Dphi = &(s->buf4), *latest_iter = &(s->buf2), *x = &(s->buf3); void (*block_op)() = (l->depth==0)?block_d_plus_clover_PRECISION:coarse_block_operator_PRECISION; void (*boundary_op)() = (l->depth==0)?block_PRECISION_boundary_op:coarse_block_PRECISION_boundary_op, (*n_boundary_op)() = (l->depth==0)?n_block_PRECISION_boundary_op:n_coarse_block_PRECISION_boundary_op, @@ -1612,7 +1514,7 @@ void red_black_schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_phi, int commdir[8] = {+1,-1,-1,+1,-1,+1,+1,-1}; SYNC_CORES(threading) - + int block_thread_start[8], block_thread_end[8]; for ( i=0; i<8; i++ ) compute_core_start_end_custom(0, s->block_list_length[i], block_thread_start+i, block_thread_end+i, l, threading, 1 ); @@ -1658,9 +1560,9 @@ void red_black_schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_phi, PROF_PRECISION_STOP( _SM3, 1 ); PROF_PRECISION_START( _SM4 ); END_MASTER(threading) - // local minres updates x, r and latest iter + // local minres updates x, r and latest iter block_solve( x, r, latest_iter, s->block[index].start*l->num_lattice_site_var, s, l, no_threading ); - START_MASTER(threading) + START_MASTER(threading) PROF_PRECISION_STOP( _SM4, 1 ); END_MASTER(threading) } @@ -1764,16 +1666,13 @@ void red_black_schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_phi, } -void schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_phi, vector_PRECISION eta, const int cycles, int res, +void schwarz_PRECISION( vector_PRECISION *phi, vector_PRECISION *D_phi, vector_PRECISION *eta, const int cycles, int res, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ) { START_NO_HYPERTHREADS(threading) int color, k, mu, i, nb = s->num_blocks, init_res = res; - vector_PRECISION r = s->buf1; 
- vector_PRECISION Dphi = s->buf4; - vector_PRECISION latest_iter = s->buf2; - vector_PRECISION x = s->buf3; + vector_PRECISION *r = &(s->buf1), *Dphi = &(s->buf4), *latest_iter = &(s->buf2), *x = &(s->buf3); void (*block_op)() = (l->depth==0)?block_d_plus_clover_PRECISION:coarse_block_operator_PRECISION; void (*boundary_op)() = (l->depth==0)?block_PRECISION_boundary_op:coarse_block_PRECISION_boundary_op, (*n_boundary_op)() = (l->depth==0)?n_block_PRECISION_boundary_op:n_coarse_block_PRECISION_boundary_op, @@ -1980,7 +1879,7 @@ void schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_phi, vector_PRE } -void sixteen_color_schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_phi, vector_PRECISION eta, const int cycles, int res, +void sixteen_color_schwarz_PRECISION( vector_PRECISION *phi, vector_PRECISION *D_phi, vector_PRECISION *eta, const int cycles, int res, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ) { START_NO_HYPERTHREADS(threading) @@ -1989,7 +1888,7 @@ void sixteen_color_schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_p if ( s->num_colors == 2 ) schwarz_PRECISION( phi, D_phi, eta, cycles, res, s, l, no_threading ); else { int color, k, mu, i, nb = s->num_blocks; - vector_PRECISION r = s->buf1, Dphi = s->buf4, latest_iter = s->buf2, x = s->buf3; + vector_PRECISION *r = &(s->buf1), *Dphi = &(s->buf4), *latest_iter = &(s->buf2), *x = &(s->buf3); void (*block_op)() = (l->depth==0)?block_d_plus_clover_PRECISION:coarse_block_operator_PRECISION; void (*boundary_op)() = (l->depth==0)?block_PRECISION_boundary_op:coarse_block_PRECISION_boundary_op, (*n_boundary_op)() = (l->depth==0)?n_block_PRECISION_boundary_op:n_coarse_block_PRECISION_boundary_op, @@ -2101,11 +2000,11 @@ void sixteen_color_schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_p #ifdef SCHWARZ_RES START_LOCKED_MASTER(threading) - vector_PRECISION true_r = NULL; - - PUBLIC_MALLOC( true_r, complex_PRECISION, l->vector_size ); - vector_PRECISION_define( true_r, 0, 0, l->inner_vector_size, l ); + vector_PRECISION true_r; + vector_PRECISION_init(&true_r); + vector_PRECISION_alloc( &true_r, _ORDINARY, 1, l, threading ); + vector_PRECISION_define( &true_r, 0, 0, l->inner_vector_size, l ); if ( D_phi == NULL ) { for ( mu=0; mu<4; mu++ ) { @@ -2113,24 +2012,24 @@ void sixteen_color_schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_p ghost_update_PRECISION( x, mu, -1, &(s->op.c), l ); } for ( i=0; iblock[i].start*l->num_lattice_site_var, s, l, no_threading ); + block_op( &true_r, x, s->block[i].start*l->num_lattice_site_var, s, l, no_threading ); } for ( mu=0; mu<4; mu++ ) { ghost_update_wait_PRECISION( x, mu, +1, &(s->op.c), l ); ghost_update_wait_PRECISION( x, mu, -1, &(s->op.c), l ); } for ( i=0; iinner_vector_size, l ); - PRECISION r_norm = global_norm_PRECISION( true_r, 0, l->inner_vector_size, l, no_threading ), + vector_PRECISION_saxpy( &true_r, eta, &true_r, -1, 0, l->inner_vector_size, l ); + PRECISION r_norm = global_norm_PRECISION( &true_r, 0, l->inner_vector_size, l, no_threading ), den = global_norm_PRECISION( eta, 0, l->inner_vector_size, l, no_threading ); r_norm/=den; char number[3]; sprintf( number, "%2d", 31+l->depth ); printf0("\033[1;%2sm|", number ); printf0(" ---- depth: %d, c: %d, schwarz iter %2d, norm: %11.6le |", l->depth, s->num_colors, k, r_norm ); printf0("\033[0m\n"); fflush(0); - PUBLIC_FREE( true_r, complex_PRECISION, l->vector_size ); + vector_PRECISION_free( &true_r, l, threading ); END_LOCKED_MASTER(threading) #endif } @@ 
-2139,10 +2038,10 @@ void sixteen_color_schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_p } -void trans_PRECISION( vector_PRECISION out, vector_double in, int *tt, level_struct *l, struct Thread *threading ) { +void trans_PRECISION( vector_PRECISION *out, vector_double *in, int *tt, level_struct *l, struct Thread *threading ) { int i, index; - vector_PRECISION out_pt = out; vector_double in_pt = in; + buffer_PRECISION out_pt = out->vector_buffer; buffer_double in_pt = in->vector_buffer; int start = threading->start_site[l->depth]; int end = threading->end_site[l->depth]; @@ -2153,16 +2052,16 @@ void trans_PRECISION( vector_PRECISION out, vector_double in, int *tt, level_str if( g.n_flavours == 2 ) for ( i=start; i<end; i++ ) { index = tt[i]; - out_pt = out + 24*index; - in_pt = in + 24*i; + out_pt = out->vector_buffer + 24*index; + in_pt = in->vector_buffer + 24*i; FOR24( *out_pt = (complex_PRECISION) *in_pt; out_pt++; in_pt++; ) } else #endif for ( i=start; i<end; i++ ) { index = tt[i]; - out_pt = out + 12*index; - in_pt = in + 12*i; + out_pt = out->vector_buffer + 12*index; + in_pt = in->vector_buffer + 12*i; FOR12( *out_pt = (complex_PRECISION) *in_pt; out_pt++; in_pt++; ) } END_NO_HYPERTHREADS(threading) @@ -2170,10 +2069,10 @@ void trans_PRECISION( vector_PRECISION out, vector_double in, int *tt, level_str } -void trans_back_PRECISION( vector_double out, vector_PRECISION in, int *tt, level_struct *l, struct Thread *threading ) { +void trans_back_PRECISION( vector_double *out, vector_PRECISION *in, int *tt, level_struct *l, struct Thread *threading ) { int i, index; - vector_double out_pt = out; vector_PRECISION in_pt = in; + buffer_double out_pt = out->vector_buffer; buffer_PRECISION in_pt = in->vector_buffer; int start = threading->start_site[l->depth]; int end = threading->end_site[l->depth]; @@ -2184,16 +2083,16 @@ void trans_back_PRECISION( vector_double out, vector_PRECISION in, int *tt, leve if( g.n_flavours == 2 ) for ( i=start; i<end; i++ ) { index = tt[i]; - in_pt = in + 24*index; - out_pt = out + 24*i; + in_pt = in->vector_buffer + 24*index; + out_pt = out->vector_buffer + 24*i; FOR24( *out_pt = (complex_double) *in_pt; out_pt++; in_pt++; ) } else #endif for ( i=start; i<end; i++ ) { index = tt[i]; - in_pt = in + 12*index; - out_pt = out + 12*i; + in_pt = in->vector_buffer + 12*index; + out_pt = out->vector_buffer + 12*i; FOR12( *out_pt = (complex_double) *in_pt; out_pt++; in_pt++; ) } END_NO_HYPERTHREADS(threading) @@ -2201,6 +2100,79 @@ void trans_back_PRECISION( vector_double out, vector_PRECISION in, int *tt, leve } +void trans_PRECISION_new( vector_PRECISION *out, vector_double *in, int *tt, level_struct *l, struct Thread *threading ) { + + int i, j, k, index; + buffer_PRECISION out_pt = out->vector_buffer; buffer_double in_pt = in->vector_buffer; + int start = threading->start_site[l->depth]; + int end = threading->end_site[l->depth]; + //compute_core_start_end(0, in->size, &start, &end, l, threading); + + // this function reorders the data according to the translation table; barriers ensure that everything is in sync + SYNC_CORES(threading) + START_NO_HYPERTHREADS(threading) +#ifdef HAVE_TM1p1 + if( g.n_flavours == 2 ) + for ( i=start; i<end; i++ ) { + index = tt[i]; + out_pt = out->vector_buffer + 24*index; + in_pt = in->vector_buffer + 24*i; + FOR24( *out_pt = (complex_PRECISION) *in_pt; out_pt++; in_pt++; ) + } + else +#endif + for ( i=start; i<end; i++ ) { + index = tt[i]; + out_pt = out->vector_buffer + 12*index*in->num_vect; + in_pt = in->vector_buffer + 12*i*in->num_vect; + for( k=0; k<12; k++) + for( j=0; j<in->num_vect; j++){ + *out_pt = (complex_PRECISION) *in_pt; + out_pt++; + in_pt++; + } + } + END_NO_HYPERTHREADS(threading) + SYNC_CORES(threading) +} + + +void trans_back_PRECISION_new( vector_double *out, vector_PRECISION *in, int *tt, level_struct *l, struct Thread *threading ) { + + int i, j, k, index; + buffer_double out_pt = out->vector_buffer; buffer_PRECISION in_pt = in->vector_buffer; + int start =
threading->start_site[l->depth]; + int end = threading->end_site[l->depth]; + + // this function reorders the data according to the translation table; barriers ensure that everything is in sync + SYNC_CORES(threading) + START_NO_HYPERTHREADS(threading) +#ifdef HAVE_TM1p1 + if( g.n_flavours == 2 ) + for ( i=start; i<end; i++ ) { + index = tt[i]; + in_pt = in->vector_buffer + 24*index; + out_pt = out->vector_buffer + 24*i; + FOR24( *out_pt = (complex_double) *in_pt; out_pt++; in_pt++; ) + } + else +#endif + for ( i=start; i<end; i++ ) { + index = tt[i]; + in_pt = in->vector_buffer + 12*index*in->num_vect; + out_pt = out->vector_buffer + 12*i*in->num_vect; + for( k=0; k<12; k++) + for( j=0; j<in->num_vect; j++){ + *out_pt = (complex_double) *in_pt; + out_pt++; + in_pt++; + } + } + END_NO_HYPERTHREADS(threading) + SYNC_CORES(threading) +} + + void schwarz_PRECISION_def( schwarz_PRECISION_struct *s, operator_double_struct *op, level_struct *l ) { schwarz_PRECISION_alloc( s, l ); @@ -2214,50 +2186,51 @@ void schwarz_PRECISION_mvm_testfun( schwarz_PRECISION_struct *s, level_struct *l START_UNTHREADED_FUNCTION(threading) int mu, i, nb = s->num_blocks; - int svs = l->schwarz_vector_size; int ivs = l->inner_vector_size; - int vs = l->vector_size; void (*block_op)() = (l->depth==0)?block_d_plus_clover_PRECISION:coarse_block_operator_PRECISION; void (*op)() = (l->depth==0)?d_plus_clover_PRECISION:apply_coarse_operator_PRECISION; void (*boundary_op)() = (l->depth==0)?block_PRECISION_boundary_op:coarse_block_PRECISION_boundary_op; - vector_PRECISION v1 = NULL, v2 = NULL, v3 = NULL; + vector_PRECISION v1, v2, v3; PRECISION diff; - MALLOC( v1, complex_PRECISION, svs ); - MALLOC( v2, complex_PRECISION, vs ); - MALLOC( v3, complex_PRECISION, vs ); + vector_PRECISION_init( &v1 ); + vector_PRECISION_init( &v2 ); + vector_PRECISION_init( &v3 ); + + vector_PRECISION_alloc( &v1, _SCHWARZ, 1, l, no_threading ); + vector_PRECISION_alloc( &v2, _ORDINARY, 1, l, no_threading ); + vector_PRECISION_alloc( &v3, _ORDINARY, 1, l, no_threading ); - vector_PRECISION_define_random( v1, 0, ivs, l ); + vector_PRECISION_define_random( &v1, 0, ivs, l ); - op( v3, v1, &(s->op), l, no_threading ); + op( &v3, &v1, &(s->op), l, no_threading ); for ( mu=0; mu<4; mu++ ) { - ghost_update_PRECISION( v1, mu, +1, &(s->op.c), l ); - ghost_update_PRECISION( v1, mu, -1, &(s->op.c), l ); + ghost_update_PRECISION( &v1, mu, +1, &(s->op.c), l ); + ghost_update_PRECISION( &v1, mu, -1, &(s->op.c), l ); } for ( mu=0; mu<4; mu++ ) { - ghost_update_wait_PRECISION( v1, mu, +1, &(s->op.c), l ); - ghost_update_wait_PRECISION( v1, mu, -1, &(s->op.c), l ); + ghost_update_wait_PRECISION( &v1, mu, +1, &(s->op.c), l ); + ghost_update_wait_PRECISION( &v1, mu, -1, &(s->op.c), l ); } for ( i=0; i<nb; i++ ) { - block_op( v2, v1, s->block[i].start*l->num_lattice_site_var, s, l, no_threading ); - boundary_op( v2, v1, i, s, l, no_threading ); + block_op( &v2, &v1, s->block[i].start*l->num_lattice_site_var, s, l, no_threading ); + boundary_op( &v2, &v1, i, s, l, no_threading ); } - vector_PRECISION_minus( v3, v3, v2, 0, l->inner_vector_size, l ); - diff = global_norm_PRECISION( v3, 0, l->inner_vector_size, l, no_threading ) / - global_norm_PRECISION( v2, 0, l->inner_vector_size, l, no_threading ); + vector_PRECISION_minus( &v3, &v3, &v2, 0, l->inner_vector_size, l ); + diff = global_norm_PRECISION( &v3, 0, l->inner_vector_size, l, no_threading ) / + global_norm_PRECISION( &v2, 0, l->inner_vector_size, l, no_threading ); test0_PRECISION("depth: %d, correctness of local residual vector: %le\n", l->depth, diff ); - - FREE( v1, complex_PRECISION, l->schwarz_vector_size ); - FREE( v2, complex_PRECISION, l->vector_size
); - FREE( v3, complex_PRECISION, l->vector_size ); + + vector_PRECISION_free( &v1, l, no_threading ); + vector_PRECISION_free( &v2, l, no_threading ); + vector_PRECISION_free( &v3, l, no_threading ); END_UNTHREADED_FUNCTION(threading) } - diff --git a/src/schwarz_generic.h b/src/schwarz_generic.h index fab1613..2bc22d7 100644 --- a/src/schwarz_generic.h +++ b/src/schwarz_generic.h @@ -24,13 +24,13 @@ struct Thread; - void block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, int k, + void block_PRECISION_boundary_op( vector_PRECISION *eta, vector_PRECISION *phi, int k, schwarz_PRECISION_struct *s, level_struct *l ); - void n_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, int k, + void n_block_PRECISION_boundary_op( vector_PRECISION *eta, vector_PRECISION *phi, int k, schwarz_PRECISION_struct *s, level_struct *l ); - void coarse_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, + void coarse_block_PRECISION_boundary_op( vector_PRECISION *eta, vector_PRECISION *phi, int k, schwarz_PRECISION_struct *s, level_struct *l ); - void n_coarse_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, + void n_coarse_block_PRECISION_boundary_op( vector_PRECISION *eta, vector_PRECISION *phi, int k, schwarz_PRECISION_struct *s, level_struct *l ); void smoother_PRECISION_def( level_struct *l ); @@ -46,17 +46,19 @@ struct Thread; void schwarz_PRECISION_def( schwarz_PRECISION_struct *s, operator_double_struct *op, level_struct *l ); void schwarz_PRECISION_free( schwarz_PRECISION_struct *s, level_struct *l ); - void additive_schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_phi, vector_PRECISION eta, const int cycles, int res, + void additive_schwarz_PRECISION( vector_PRECISION *phi, vector_PRECISION *D_phi, vector_PRECISION *eta, const int cycles, int res, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ); - void red_black_schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_phi, vector_PRECISION eta, const int cycles, int res, + void red_black_schwarz_PRECISION( vector_PRECISION *phi, vector_PRECISION *D_phi, vector_PRECISION *eta, const int cycles, int res, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ); - void schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_phi, vector_PRECISION eta, const int cycles, int res, + void schwarz_PRECISION( vector_PRECISION *phi, vector_PRECISION *D_phi, vector_PRECISION *eta, const int cycles, int res, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ); - void sixteen_color_schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_phi, vector_PRECISION eta, const int cycles, int res, + void sixteen_color_schwarz_PRECISION( vector_PRECISION *phi, vector_PRECISION *D_phi, vector_PRECISION *eta, const int cycles, int res, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ); - void trans_PRECISION( vector_PRECISION out, vector_double in, int *tt, level_struct *l, struct Thread *threading ); - void trans_back_PRECISION( vector_double out, vector_PRECISION in, int *tt, level_struct *l, struct Thread *threading ); + void trans_PRECISION( vector_PRECISION *out, vector_double *in, int *tt, level_struct *l, struct Thread *threading ); + void trans_back_PRECISION( vector_double *out, vector_PRECISION *in, int *tt, level_struct *l, struct Thread *threading ); + void trans_PRECISION_new( vector_PRECISION *out, vector_double *in, int *tt, level_struct *l, struct Thread *threading ); 
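(Illustrative sketch, not part of the patch: it shows how the struct-based vectors introduced by this diff would be driven through the multi-right-hand-side translation routine declared just above, together with trans_back_PRECISION_new declared next. All identifiers are taken from calls that already appear elsewhere in this diff (vector_double_init/_alloc/_free, vector_PRECISION_init/_alloc/_free, the _INNER and _ORDINARY size classes, l->s_PRECISION.op.translation_table, the global no_threading); the reading of the third _alloc argument as the number of right-hand sides, and the num_vect semantics, are assumptions inferred from the surrounding hunks.)

  /* hypothetical example following the allocation pattern used in this diff */
  void translation_roundtrip_example_PRECISION( level_struct *l ) {
    vector_double vd;      // fields in double precision, lexicographic site order
    vector_PRECISION vp;   // same fields in working precision, translated site order

    vector_double_init( &vd );
    vector_PRECISION_init( &vp );
    vector_double_alloc( &vd, _INNER, 1, l, no_threading );      // 1 = assumed number of rhs
    vector_PRECISION_alloc( &vp, _ORDINARY, 1, l, no_threading );

    // forward: copy 12 spin-color components per site and per right-hand side,
    // placing site i of the input at site tt[i] of the output
    trans_PRECISION_new( &vp, &vd, l->s_PRECISION.op.translation_table, l, no_threading );

    // ... smoother / solver acting on vp ...

    // backward: undo the reordering and promote back to double precision
    trans_back_PRECISION_new( &vd, &vp, l->s_PRECISION.op.translation_table, l, no_threading );

    vector_PRECISION_free( &vp, l, no_threading );
    vector_double_free( &vd, l, no_threading );
  }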
+ void trans_back_PRECISION_new( vector_double *out, vector_PRECISION *in, int *tt, level_struct *l, struct Thread *threading ); void schwarz_PRECISION_mvm_testfun( schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ); @@ -74,22 +76,4 @@ struct Thread; } } -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_float -static inline void set_PRECISION_D_vectorized( PRECISION *out1, PRECISION *out2, complex_PRECISION *in ) { - // out1: column major, out2: row major - for ( int i=0; i<3; i++ ) { // column - for ( int j=0; j<3; j++ ) { // row - out1[8*i +j] = creal_PRECISION(in[3*j+i]); - out1[8*i+4+j] = cimag_PRECISION(in[3*j+i]); - out2[8*i +j] = creal_PRECISION(in[j+3*i]); - out2[8*i+4+j] = cimag_PRECISION(in[j+3*i]); - } - out1[8*i+3] = 0.0; - out1[8*i+7] = 0.0; - out2[8*i+3] = 0.0; - out2[8*i+7] = 0.0; - } -} -#endif - #endif diff --git a/src/setup_generic.c b/src/setup_generic.c index 4493bae..e91c3c9 100644 --- a/src/setup_generic.c +++ b/src/setup_generic.c @@ -32,13 +32,8 @@ void coarse_grid_correction_PRECISION_setup( level_struct *l, struct Thread *thr START_LOCKED_MASTER(threading) coarse_operator_PRECISION_alloc( l ); -#ifndef INTERPOLATION_SETUP_LAYOUT_OPTIMIZED_PRECISION coarse_operator_PRECISION_setup( l->is_PRECISION.interpolation, l ); END_LOCKED_MASTER(threading) -#else - END_LOCKED_MASTER(threading) - coarse_operator_PRECISION_setup_vectorized( l->is_PRECISION.operator, l, threading ); -#endif START_LOCKED_MASTER(threading) if ( !l->next_level->idle ) { @@ -91,12 +86,12 @@ void coarse_grid_correction_PRECISION_setup( level_struct *l, struct Thread *thr SYNC_HYPERTHREADS(threading) if ( !l->idle ) { for ( int i=0; inext_level->num_eig_vect,l->num_eig_vect); i++ ) { - restrict_PRECISION( l->next_level->is_PRECISION.test_vector[i], l->is_PRECISION.test_vector[i], l, threading ); + restrict_PRECISION( &(l->next_level->is_PRECISION.test_vector[i]), &(l->is_PRECISION.test_vector[i]), l, threading ); } START_LOCKED_MASTER(threading) for ( int i=MIN(l->next_level->num_eig_vect,l->num_eig_vect); inext_level->num_eig_vect; i++ ) { if ( !l->next_level->idle ) - vector_PRECISION_define_random( l->next_level->is_PRECISION.test_vector[i], 0, + vector_PRECISION_define_random( &(l->next_level->is_PRECISION.test_vector[i]), 0, l->next_level->inner_vector_size, l->next_level ); } END_LOCKED_MASTER(threading) @@ -142,18 +137,19 @@ void read_tv_from_file_PRECISION( level_struct *l, struct Thread *threading ) { int n = l->num_eig_vect, i; char filename[STRINGLENGTH+1]; - vector_double tmp = NULL; + vector_double tmp; + vector_double_init(&tmp); - MALLOC( tmp, complex_double, l->inner_vector_size ); + vector_double_alloc( &tmp, _INNER, 1, l, no_threading ); for ( i=0; iis_PRECISION.test_vector[i], tmp, l->s_PRECISION.op.translation_table, l, no_threading ); + vector_io( (double*)tmp.vector_buffer, filename, _READ, l ); + trans_PRECISION( &(l->is_PRECISION.test_vector[i]), &tmp, l->s_PRECISION.op.translation_table, l, no_threading ); } - FREE( tmp, complex_double, l->inner_vector_size ); + vector_double_free( &tmp, l, no_threading ); END_LOCKED_MASTER(threading) @@ -197,20 +193,22 @@ void interpolation_PRECISION_define( vector_double *V, level_struct *l, struct T int pi = 1, pn = n*6; #endif vector_PRECISION *buffer = NULL; + int start = threading->start_index[l->depth]; int end = threading->end_index[l->depth]; if ( V == NULL ) { - PUBLIC_MALLOC( buffer, complex_PRECISION*, 3 ); + PUBLIC_MALLOC( buffer, vector_PRECISION, 3 ); START_MASTER(threading) - buffer[0] = NULL; + 
vector_PRECISION_init(&buffer[0]); END_MASTER(threading) - PUBLIC_MALLOC( buffer[0], complex_PRECISION, l->vector_size*3 ); START_MASTER(threading) - for( i=1; i<3; i++) - buffer[i] = buffer[0] + l->vector_size*i; + for( i=0; i<3; i++){ + vector_PRECISION_init( &buffer[i] ); + vector_PRECISION_alloc( &buffer[i], _ORDINARY, 1, l, threading ); + } if ( g.print > 0 ) printf0("initial definition --- depth: %d\n", l->depth ); #ifdef DEBUG if ( g.print > 0 ) { printf0("\033[0;42m\033[1;37m|"); fflush(0); } @@ -221,16 +219,16 @@ void interpolation_PRECISION_define( vector_double *V, level_struct *l, struct T for ( k=0; kdepth == 0 ) { START_LOCKED_MASTER(threading) - vector_PRECISION_define_random( l->is_PRECISION.test_vector[k], 0, l->inner_vector_size, l ); + vector_PRECISION_define_random( &(l->is_PRECISION.test_vector[k]), 0, l->inner_vector_size, l ); END_LOCKED_MASTER(threading) // } - smoother_PRECISION( buffer[0], NULL, l->is_PRECISION.test_vector[k], 1, _NO_RES, l, threading ); - vector_PRECISION_copy( l->is_PRECISION.test_vector[k], buffer[0], start, end, l ); - smoother_PRECISION( buffer[0], NULL, l->is_PRECISION.test_vector[k], g.method>=4?1:2, _NO_RES, l, threading ); - vector_PRECISION_copy( l->is_PRECISION.test_vector[k], buffer[0], start, end, l ); - smoother_PRECISION( buffer[0], NULL, l->is_PRECISION.test_vector[k], g.method>=4?1:3, _NO_RES, l, threading ); - vector_PRECISION_copy( l->is_PRECISION.test_vector[k], buffer[0], start, end, l ); + smoother_PRECISION( &buffer[0], NULL, &(l->is_PRECISION.test_vector[k]), 1, _NO_RES, l, threading ); + vector_PRECISION_copy( &(l->is_PRECISION.test_vector[k]), &buffer[0], start, end, l ); + smoother_PRECISION( &buffer[0], NULL, &(l->is_PRECISION.test_vector[k]), g.method>=4?1:2, _NO_RES, l, threading ); + vector_PRECISION_copy( &(l->is_PRECISION.test_vector[k]), &buffer[0], start, end, l ); + smoother_PRECISION( &buffer[0], NULL, &(l->is_PRECISION.test_vector[k]), g.method>=4?1:3, _NO_RES, l, threading ); + vector_PRECISION_copy( &(l->is_PRECISION.test_vector[k]), &buffer[0], start, end, l ); pc += 6; #ifdef DEBUG @@ -240,12 +238,14 @@ void interpolation_PRECISION_define( vector_double *V, level_struct *l, struct T #endif } - PUBLIC_FREE( buffer[0], complex_PRECISION, l->vector_size*3 ); - PUBLIC_FREE( buffer, complex_PRECISION*, 3 ); + for( i=0; i<3; i++){ + vector_PRECISION_free( &buffer[i], l, threading ); + } + PUBLIC_FREE( buffer, vector_PRECISION, 3 ); for ( k=0; kis_PRECISION.test_vector[k], l->is_PRECISION.test_vector[k], - 1.0/global_norm_PRECISION( l->is_PRECISION.test_vector[k], 0, l->inner_vector_size, l, threading ), + vector_PRECISION_real_scale( &(l->is_PRECISION.test_vector[k]), &(l->is_PRECISION.test_vector[k]), + 1.0/global_norm_PRECISION( &(l->is_PRECISION.test_vector[k]), 0, l->inner_vector_size, l, threading ), start, end, l ); } @@ -257,27 +257,20 @@ void interpolation_PRECISION_define( vector_double *V, level_struct *l, struct T } else { for ( i=0; iis_PRECISION.test_vector[i], V[i], l->s_PRECISION.op.translation_table, l, threading ); + trans_PRECISION( &(l->is_PRECISION.test_vector[i]), &V[i], l->s_PRECISION.op.translation_table, l, threading ); } } -#ifndef INTERPOLATION_SETUP_LAYOUT_OPTIMIZED_PRECISION for ( k=0; kis_PRECISION.interpolation[k], l->is_PRECISION.test_vector[k], start, end, l ); + vector_PRECISION_copy( &(l->is_PRECISION.interpolation[k]), &(l->is_PRECISION.test_vector[k]), start, end, l ); } -#endif testvector_analysis_PRECISION( l->is_PRECISION.test_vector, l, threading ); -#ifdef 
INTERPOLATION_SETUP_LAYOUT_OPTIMIZED_PRECISION - define_interpolation_PRECISION_operator( l->is_PRECISION.test_vector, l, threading ); - gram_schmidt_on_aggregates_PRECISION_vectorized( l->is_PRECISION.operator, n, l, threading ); -#else gram_schmidt_on_aggregates_PRECISION( l->is_PRECISION.interpolation, n, l, threading ); define_interpolation_PRECISION_operator( l->is_PRECISION.interpolation, l, threading ); -#endif } @@ -286,16 +279,8 @@ void re_setup_PRECISION( level_struct *l, struct Thread *threading ) { if ( l->level > 0 ) { if ( !l->idle ) { -#ifdef INTERPOLATION_SETUP_LAYOUT_OPTIMIZED_PRECISION - define_interpolation_PRECISION_operator( l->is_PRECISION.test_vector, l, threading ); - gram_schmidt_on_aggregates_PRECISION_vectorized( l->is_PRECISION.operator, l->num_eig_vect, l, threading ); - if ( l->depth > 0 ) - gram_schmidt_on_aggregates_PRECISION_vectorized( l->is_PRECISION.operator, l->num_eig_vect, l, threading ); - coarse_operator_PRECISION_setup_vectorized( l->is_PRECISION.operator, l, threading ); - START_LOCKED_MASTER(threading) -#else for ( int i=0; inum_eig_vect; i++ ) { - vector_PRECISION_copy( l->is_PRECISION.interpolation[i], l->is_PRECISION.test_vector[i], + vector_PRECISION_copy( &(l->is_PRECISION.interpolation[i]), &(l->is_PRECISION.test_vector[i]), threading->start_index[l->depth], threading->end_index[l->depth], l ); } gram_schmidt_on_aggregates_PRECISION( l->is_PRECISION.interpolation, l->num_eig_vect, l, threading ); @@ -304,7 +289,7 @@ void re_setup_PRECISION( level_struct *l, struct Thread *threading ) { define_interpolation_PRECISION_operator( l->is_PRECISION.interpolation, l, threading ); START_LOCKED_MASTER(threading) coarse_operator_PRECISION_setup( l->is_PRECISION.interpolation, l ); -#endif + conf_PRECISION_gather( &(l->next_level->s_PRECISION.op), &(l->next_level->op_PRECISION), l->next_level ); END_LOCKED_MASTER(threading) if ( !l->next_level->idle && l->next_level->level > 0 ) { @@ -331,15 +316,16 @@ void re_setup_PRECISION( level_struct *l, struct Thread *threading ) { void inv_iter_2lvl_extension_setup_PRECISION( int setup_iter, level_struct *l, struct Thread *threading ) { if ( !l->idle ) { - vector_PRECISION buf1 = NULL; + vector_PRECISION buf1; gmres_PRECISION_struct gmres; // TODO: bugfix - threading, etc START_LOCKED_MASTER(threading) - MALLOC( buf1, complex_PRECISION, l->vector_size ); + vector_PRECISION_init( &buf1 ); + vector_PRECISION_alloc( &buf1, _ORDINARY, 1, l, no_threading ); fgmres_PRECISION_struct_init( &gmres ); - fgmres_PRECISION_struct_alloc( g.coarse_iter, g.coarse_restart, l->next_level->vector_size, g.coarse_tol, + fgmres_PRECISION_struct_alloc( g.coarse_iter, g.coarse_restart, _ORDINARY, g.coarse_tol, _COARSE_GMRES, _NOTHING, NULL, apply_coarse_operator_PRECISION, &gmres, l->next_level ); if ( g.odd_even && l->next_level->level == 0 ) @@ -358,7 +344,7 @@ void inv_iter_2lvl_extension_setup_PRECISION( int setup_iter, level_struct *l, s #endif END_MASTER(threading) for ( int i=0; inum_eig_vect; i++ ) { - restrict_PRECISION( gmres.b, l->is_PRECISION.test_vector[i], l, threading ); + restrict_PRECISION( &(gmres.b), &(l->is_PRECISION.test_vector[i]), l, threading ); if ( !l->next_level->idle ) { if ( g.odd_even && l->next_level->level == 0 ) { coarse_solve_odd_even_PRECISION( &gmres, &(l->next_level->oe_op_PRECISION), l->next_level, threading ); @@ -366,10 +352,10 @@ void inv_iter_2lvl_extension_setup_PRECISION( int setup_iter, level_struct *l, s fgmres_PRECISION( &gmres, l->next_level, threading ); } } - interpolate3_PRECISION( 
buf1, gmres.x, l, threading ); - smoother_PRECISION( buf1, NULL, l->is_PRECISION.test_vector[i], l->post_smooth_iter, _RES, l, threading ); - vector_PRECISION_real_scale( l->is_PRECISION.test_vector[i], buf1, - 1.0/global_norm_PRECISION( buf1, 0, l->inner_vector_size, l, threading ), + interpolate3_PRECISION( &buf1, &(gmres.x), l, threading ); + smoother_PRECISION( &buf1, NULL, &(l->is_PRECISION.test_vector[i]), l->post_smooth_iter, _RES, l, threading ); + vector_PRECISION_real_scale( &(l->is_PRECISION.test_vector[i]), &buf1, + 1.0/global_norm_PRECISION( &buf1, 0, l->inner_vector_size, l, threading ), threading->start_index[l->depth], threading->end_index[l->depth], l ); pc += l->post_smooth_iter; #ifdef DEBUG @@ -384,16 +370,8 @@ void inv_iter_2lvl_extension_setup_PRECISION( int setup_iter, level_struct *l, s END_MASTER(threading) #endif -#ifdef INTERPOLATION_SETUP_LAYOUT_OPTIMIZED_PRECISION - define_interpolation_PRECISION_operator( l->is_PRECISION.test_vector, l, threading ); - gram_schmidt_on_aggregates_PRECISION_vectorized( l->is_PRECISION.operator, l->num_eig_vect, l, threading ); - if ( l->depth > 0 ) - gram_schmidt_on_aggregates_PRECISION_vectorized( l->is_PRECISION.operator, l->num_eig_vect, l, threading ); - coarse_operator_PRECISION_setup_vectorized( l->is_PRECISION.operator, l, threading ); - START_LOCKED_MASTER(threading) -#else for ( int i=0; inum_eig_vect; i++ ) - vector_PRECISION_copy( l->is_PRECISION.interpolation[i], l->is_PRECISION.test_vector[i], + vector_PRECISION_copy( &(l->is_PRECISION.interpolation[i]), &(l->is_PRECISION.test_vector[i]), threading->start_index[l->depth], threading->end_index[l->depth], l ); gram_schmidt_on_aggregates_PRECISION( l->is_PRECISION.interpolation, l->num_eig_vect, l, threading ); if ( l->depth > 0 ) @@ -401,7 +379,7 @@ void inv_iter_2lvl_extension_setup_PRECISION( int setup_iter, level_struct *l, s define_interpolation_PRECISION_operator( l->is_PRECISION.interpolation, l, threading ); START_LOCKED_MASTER(threading) coarse_operator_PRECISION_setup( l->is_PRECISION.interpolation, l ); -#endif + conf_PRECISION_gather( &(l->next_level->s_PRECISION.op), &(l->next_level->op_PRECISION), l->next_level ); END_LOCKED_MASTER(threading) if ( !l->next_level->idle && l->next_level->level > 0 ) { @@ -425,7 +403,7 @@ void inv_iter_2lvl_extension_setup_PRECISION( int setup_iter, level_struct *l, s inv_iter_2lvl_extension_setup_PRECISION( setup_iter, l->next_level, threading ); START_LOCKED_MASTER(threading) - FREE( buf1, complex_PRECISION, l->vector_size ); + vector_PRECISION_free( &buf1, l, no_threading ); fgmres_PRECISION_struct_free( &gmres, l ); END_LOCKED_MASTER(threading) } @@ -448,17 +426,19 @@ void test_vector_PRECISION_update( int i, level_struct *l, struct Thread *thread test_vector_PRECISION_update( i, l->next_level, threading ); if ( !l->idle ) - vector_PRECISION_real_scale( l->is_PRECISION.test_vector[i], l->p_PRECISION.x, - 1.0/global_norm_PRECISION( l->p_PRECISION.x, 0, l->inner_vector_size, l, threading ), + vector_PRECISION_real_scale( &(l->is_PRECISION.test_vector[i]), &(l->p_PRECISION.x), + 1.0/global_norm_PRECISION( &(l->p_PRECISION.x), 0, l->inner_vector_size, l, threading ), threading->start_index[l->depth], threading->end_index[l->depth], l ); } void inv_iter_inv_fcycle_PRECISION( int setup_iter, level_struct *l, struct Thread *threading ) { - vector_PRECISION v_buf = NULL; + vector_PRECISION v_buf; complex_PRECISION *buffer = NULL; + vector_PRECISION_init(&v_buf); + PUBLIC_MALLOC( buffer, complex_PRECISION, 2*l->num_eig_vect ); 
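+ // note: buffer holds only 2*num_eig_vect scalars of workspace for orthonormalizing the test vectors, not a lattice field, so it stays a raw PUBLIC_MALLOC array; the lattice-sized v_buf is handled through the new vector_PRECISION_alloc/_free below instead.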
START_LOCKED_MASTER(threading) @@ -466,8 +446,8 @@ void inv_iter_inv_fcycle_PRECISION( int setup_iter, level_struct *l, struct Thre set_kcycle_tol_PRECISION( g.coarse_tol, l ); END_LOCKED_MASTER(threading) SYNC_MASTER_TO_ALL(threading) - - PUBLIC_MALLOC( v_buf, complex_PRECISION, l->vector_size ); + + vector_PRECISION_alloc( &v_buf, _ORDINARY, 1, l, threading ); if ( !l->idle ) { for ( int j=0; jis_PRECISION.test_vector, buffer, 0, l->num_eig_vect, l, threading ); for ( int i=0; inum_eig_vect; i++ ) { - vcycle_PRECISION( l->p_PRECISION.x, NULL, l->is_PRECISION.test_vector[i], _NO_RES, l, threading ); + vcycle_PRECISION( &(l->p_PRECISION.x), NULL, &(l->is_PRECISION.test_vector[i]), _NO_RES, l, threading ); test_vector_PRECISION_update( i, l, threading ); @@ -515,8 +495,8 @@ void inv_iter_inv_fcycle_PRECISION( int setup_iter, level_struct *l, struct Thre ((double)l->setup_iter))), l->next_level, threading ); } } - - PUBLIC_FREE( v_buf, complex_PRECISION, l->vector_size ); + + vector_PRECISION_free( &v_buf, l, threading ); PUBLIC_FREE( buffer, complex_PRECISION, 2*l->num_eig_vect ); if ( l->depth == 0 ) { @@ -537,12 +517,12 @@ void testvector_analysis_PRECISION( vector_PRECISION *test_vectors, level_struct printf0("--------------------------------------- depth: %d ----------------------------------------\n", l->depth ); for ( int i=0; inum_eig_vect; i++ ) { printf0("vector #%02d: ", i+1 ); - apply_operator_PRECISION( l->vbuf_PRECISION[3], test_vectors[i], &(l->p_PRECISION), l, no_threading ); - coarse_gamma5_PRECISION( l->vbuf_PRECISION[0], l->vbuf_PRECISION[3], 0, l->inner_vector_size, l ); - lambda = global_inner_product_PRECISION( test_vectors[i], l->vbuf_PRECISION[0], 0, l->inner_vector_size, l, no_threading ); - lambda /= global_inner_product_PRECISION( test_vectors[i], test_vectors[i], 0, l->inner_vector_size, l, no_threading ); - vector_PRECISION_saxpy( l->vbuf_PRECISION[1], l->vbuf_PRECISION[0], test_vectors[i], -lambda, 0, l->inner_vector_size, l ); - mu = global_norm_PRECISION( l->vbuf_PRECISION[1], 0, l->inner_vector_size, l, no_threading )/global_norm_PRECISION( test_vectors[i], 0, l->inner_vector_size, l, no_threading ); + apply_operator_PRECISION( &(l->vbuf_PRECISION[3]), test_vectors+i, &(l->p_PRECISION), l, no_threading ); + coarse_gamma5_PRECISION( &(l->vbuf_PRECISION[0]), &(l->vbuf_PRECISION[3]), 0, l->inner_vector_size, l ); + lambda = global_inner_product_PRECISION( test_vectors+i, &(l->vbuf_PRECISION[0]), 0, l->inner_vector_size, l, no_threading ); + lambda /= global_inner_product_PRECISION( test_vectors+i, test_vectors+i, 0, l->inner_vector_size, l, no_threading ); + vector_PRECISION_saxpy( &(l->vbuf_PRECISION[1]), &(l->vbuf_PRECISION[0]), test_vectors+i, -lambda, 0, l->inner_vector_size, l ); + mu = global_norm_PRECISION( &(l->vbuf_PRECISION[1]), 0, l->inner_vector_size, l, no_threading )/global_norm_PRECISION( test_vectors+i, 0, l->inner_vector_size, l, no_threading ); printf0("singular value: %+lf%+lfi, singular vector precision: %le\n", (double)creal(lambda), (double)cimag(lambda), (double)mu ); } printf0("--------------------------------------- depth: %d ----------------------------------------\n", l->depth ); diff --git a/src/setup_generic.h b/src/setup_generic.h index 6d0ae49..c2926a2 100644 --- a/src/setup_generic.h +++ b/src/setup_generic.h @@ -26,7 +26,7 @@ void coarse_grid_correction_PRECISION_setup( level_struct *l, struct Thread *threading ); void coarse_grid_correction_PRECISION_free( level_struct *l ); - void interpolation_PRECISION_define( vector_double* 
V, level_struct *l, struct Thread *threading ); + void interpolation_PRECISION_define( vector_double *V, level_struct *l, struct Thread *threading ); void iterative_PRECISION_setup( int setup_iter, level_struct *l, struct Thread *threading ); void re_setup_PRECISION( level_struct *l, struct Thread *threading ); void inv_iter_inv_fcycle_PRECISION( int setup_iter, level_struct *l, struct Thread *threading ); diff --git a/src/solver_analysis.c b/src/solver_analysis.c index 325165e..1c1a20b 100644 --- a/src/solver_analysis.c +++ b/src/solver_analysis.c @@ -50,6 +50,11 @@ void test_routine( level_struct *l, struct Thread *threading ) { if ( g.method > 0 && g.method < 4 && g.odd_even ) block_oddeven_double_test( l, threading ); } + /* if ( g.mixed_precision ) + vector_float_test_routine( l, threading ); + else + vector_double_test_routine( l, threading ); +*/ if ( g.interpolation && g.method > 0 ) { if ( g.mixed_precision ) coarse_operator_float_test_routine( l, threading ); diff --git a/src/sse_blas_vectorized.h b/src/sse_blas_vectorized.h deleted file mode 100644 index df99468..0000000 --- a/src/sse_blas_vectorized.h +++ /dev/null @@ -1,381 +0,0 @@ -/* - * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder. - * - * This file is part of the DDalphaAMG solver library. - * - * The DDalphaAMG solver library is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * The DDalphaAMG solver library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * - * You should have received a copy of the GNU General Public License - * along with the DDalphaAMG solver library. If not, see http://www.gnu.org/licenses/. 
- * - */ - -#ifndef SSE_BLAS_VECTORIZED_H -#define SSE_BLAS_VECTORIZED_H -#ifdef SSE - -static inline void sse_cgem_inverse( const int N, float *A_inverse, float *A, int lda ) { - // generate LU decomp in A - - int i, j, k; - complex_float alpha; - - complex_float tmpA[N*N]; - complex_float tmpA_inverse[N*N]; - - for ( j=0; j0 ) - b[k-1] = 0; - - for ( i=0; i=0; i-- ) { - for ( j=i+1; j= j*offset; i -= SIMD_LENGTH_float ) { - ip = i%offset + 2*(i/offset)*padded; - A_re = _mm_unpacklo_ps( C1_re[i/SIMD_LENGTH_float], C1_im[i/SIMD_LENGTH_float] ); - A_im = _mm_unpackhi_ps( C1_re[i/SIMD_LENGTH_float], C1_im[i/SIMD_LENGTH_float] ); - _mm_store_ps( C+2*ip, A_re ); - _mm_store_ps( C+2*ip+SIMD_LENGTH_float, A_im ); - A_re = _mm_unpacklo_ps( C2_re[i/SIMD_LENGTH_float], C2_im[i/SIMD_LENGTH_float] ); - A_im = _mm_unpackhi_ps( C2_re[i/SIMD_LENGTH_float], C2_im[i/SIMD_LENGTH_float] ); - _mm_store_ps( C+2*(ip+padded), A_re ); - _mm_store_ps( C+2*(ip+padded)+SIMD_LENGTH_float, A_im ); - } - } - } else { -#endif - __m128 A_re; - __m128 A_im; - __m128 B_re; - __m128 B_im; - __m128 C_re[lda/SIMD_LENGTH_float]; - __m128 C_im[lda/SIMD_LENGTH_float]; - - // deinterleaved load - for ( i=0; i= j*offset; i -= SIMD_LENGTH_float ) { - ip = i%offset + 2*(i/offset)*padded; - A_re = _mm_unpacklo_ps( C1_re[i/SIMD_LENGTH_float], C1_im[i/SIMD_LENGTH_float] ); - A_im = _mm_unpackhi_ps( C1_re[i/SIMD_LENGTH_float], C1_im[i/SIMD_LENGTH_float] ); - _mm_store_ps( C+2*ip, A_re ); - _mm_store_ps( C+2*ip+SIMD_LENGTH_float, A_im ); - A_re = _mm_unpacklo_ps( C2_re[i/SIMD_LENGTH_float], C2_im[i/SIMD_LENGTH_float] ); - A_im = _mm_unpackhi_ps( C2_re[i/SIMD_LENGTH_float], C2_im[i/SIMD_LENGTH_float] ); - _mm_store_ps( C+2*(ip+padded), A_re ); - _mm_store_ps( C+2*(ip+padded)+SIMD_LENGTH_float, A_im ); - } - } - } else { -#endif - __m128 A_re; - __m128 A_im; - __m128 B_re; - __m128 B_im; - __m128 C_re[lda/SIMD_LENGTH_float]; - __m128 C_im[lda/SIMD_LENGTH_float]; - - // deinterleaved load - for ( i=0; inext_level->num_lattice_site_var/2, - offset = l->num_lattice_site_var/2; - float *spin_0_1_pt; - float *spin_2_3_pt; - float *interpolation_data; - - int component_offset = SIMD_LENGTH_float; - int fine_components = l->num_lattice_site_var; - - // U(x) = [ A B , A=A*, D=D*, C = -B* - // C D ] - // storage order: upper triangle of A, upper triangle of D, B, columnwise - // diagonal coupling - for ( int n=0; nvector_size + fine_components*SIMD_LENGTH_float*site); - - // A - buffer_re = _mm_load_ps((float *)(tmp+k1)+k+0*OPERATOR_COMPONENT_OFFSET_float); - buffer_im = _mm_load_ps((float *)(tmp+k1)+k+1*OPERATOR_COMPONENT_OFFSET_float); - for ( m=0; m broadcast - __m128 spin_0_1_re = _mm_set1_ps(spin_0_1_pt[(2*m+0)*component_offset]); - __m128 spin_0_1_im = _mm_set1_ps(spin_0_1_pt[(2*m+1)*component_offset]); - __m128 interpolation_data_re = _mm_load_ps(interpolation_data + (2*m+0)*component_offset); - __m128 interpolation_data_im = _mm_load_ps(interpolation_data + (2*m+1)*component_offset); - - cfmadd_conj(interpolation_data_re, interpolation_data_im, spin_0_1_re, spin_0_1_im, &buffer_re, &buffer_im); - } - _mm_store_ps((float *)(tmp+k1)+k+0*OPERATOR_COMPONENT_OFFSET_float, buffer_re); - _mm_store_ps((float *)(tmp+k1)+k+1*OPERATOR_COMPONENT_OFFSET_float, buffer_im); - - // D - buffer_re = _mm_load_ps((float *)(tmp+k2)+k+0*OPERATOR_COMPONENT_OFFSET_float); - buffer_im = _mm_load_ps((float *)(tmp+k2)+k+1*OPERATOR_COMPONENT_OFFSET_float); - for ( m=offset; m<2*offset; m++ ) { - // spin_2_3 is the same for all k => broadcast - __m128 spin_2_3_re = 
_mm_set1_ps(spin_2_3_pt[(2*m+0)*component_offset]); - __m128 spin_2_3_im = _mm_set1_ps(spin_2_3_pt[(2*m+1)*component_offset]); - __m128 interpolation_data_re = _mm_load_ps(interpolation_data + (2*m+0)*component_offset); - __m128 interpolation_data_im = _mm_load_ps(interpolation_data + (2*m+1)*component_offset); - - cfmadd_conj(interpolation_data_re, interpolation_data_im, spin_2_3_re, spin_2_3_im, &buffer_re, &buffer_im); - } - _mm_store_ps((float *)(tmp+k2)+k+0*OPERATOR_COMPONENT_OFFSET_float, buffer_re); - _mm_store_ps((float *)(tmp+k2)+k+1*OPERATOR_COMPONENT_OFFSET_float, buffer_im); - } - - // index k used for vectorization - for ( k=0; kvector_size + fine_components*component_offset*site); - - // B - buffer_re = _mm_load_ps((float *)(tmp+k1)+k+0*OPERATOR_COMPONENT_OFFSET_float); - buffer_im = _mm_load_ps((float *)(tmp+k1)+k+1*OPERATOR_COMPONENT_OFFSET_float); - for ( m=0; m broadcast - __m128 spin_2_3_re = _mm_set1_ps(spin_2_3_pt[(2*m+0)*component_offset]); - __m128 spin_2_3_im = _mm_set1_ps(spin_2_3_pt[(2*m+1)*component_offset]); - __m128 interpolation_data_re = _mm_load_ps(interpolation_data + (2*m+0)*component_offset); - __m128 interpolation_data_im = _mm_load_ps(interpolation_data + (2*m+1)*component_offset); - - cfmadd_conj(interpolation_data_re, interpolation_data_im, spin_2_3_re, spin_2_3_im, &buffer_re, &buffer_im); - } - _mm_store_ps((float *)(tmp+k1)+k+0*OPERATOR_COMPONENT_OFFSET_float, buffer_re); - _mm_store_ps((float *)(tmp+k1)+k+1*OPERATOR_COMPONENT_OFFSET_float, buffer_im); - } - } -#endif -} - - -static inline void sse_set_coarse_neighbor_coupling_float( complex_float *spin_0_1, complex_float *spin_2_3, - complex_float *V, const int mu, level_struct *l, int site, const int n_rhs, complex_float *tmp ) { - -#ifdef SSE - int k, k1, k2, m, num_eig_vect = l->next_level->num_lattice_site_var/2, - offset = l->num_lattice_site_var/2; - - float *spin_0_1_pt; - float *spin_2_3_pt; - float *interpolation_data; - - int component_offset = SIMD_LENGTH_float; - int fine_components = l->num_lattice_site_var; - - // U_mu(x) = [ A B , U_-mu(x+muhat) = [ A* -C* - // C D ] -B* D* ] - // storage order: A, C, B, D, each column wise - for ( int n=0; nvector_size + fine_components*component_offset*site); - - k1 = (n+0*num_eig_vect)*OPERATOR_COMPONENT_OFFSET_float; - k2 = (n+1*num_eig_vect)*OPERATOR_COMPONENT_OFFSET_float; - - // A - buffer_re = _mm_load_ps((float *)(tmp+k1)+k+0*OPERATOR_COMPONENT_OFFSET_float); - buffer_im = _mm_load_ps((float *)(tmp+k1)+k+1*OPERATOR_COMPONENT_OFFSET_float); - for ( m=0; m broadcast - __m128 spin_0_1_re = _mm_set1_ps(spin_0_1_pt[(2*m+0)*component_offset]); - __m128 spin_0_1_im = _mm_set1_ps(spin_0_1_pt[(2*m+1)*component_offset]); - __m128 interpolation_data_re = _mm_load_ps(interpolation_data + (2*m+0)*component_offset); - __m128 interpolation_data_im = _mm_load_ps(interpolation_data + (2*m+1)*component_offset); - - cfmadd_conj(interpolation_data_re, interpolation_data_im, spin_0_1_re, spin_0_1_im, &buffer_re, &buffer_im); - } - _mm_store_ps((float *)(tmp+k1)+k+0*OPERATOR_COMPONENT_OFFSET_float, buffer_re); - _mm_store_ps((float *)(tmp+k1)+k+1*OPERATOR_COMPONENT_OFFSET_float, buffer_im); - - // C - buffer_re = _mm_load_ps((float *)(tmp+k2)+k+0*OPERATOR_COMPONENT_OFFSET_float); - buffer_im = _mm_load_ps((float *)(tmp+k2)+k+1*OPERATOR_COMPONENT_OFFSET_float); - for ( m=offset; m<2*offset; m++ ) { - // spin_0_1 is the same for all k => broadcast - __m128 spin_0_1_re = _mm_set1_ps(spin_0_1_pt[(2*m+0)*component_offset]); - __m128 spin_0_1_im = 
_mm_set1_ps(spin_0_1_pt[(2*m+1)*component_offset]); - __m128 interpolation_data_re = _mm_load_ps(interpolation_data + (2*m+0)*component_offset); - __m128 interpolation_data_im = _mm_load_ps(interpolation_data + (2*m+1)*component_offset); - - cfmadd_conj(interpolation_data_re, interpolation_data_im, spin_0_1_re, spin_0_1_im, &buffer_re, &buffer_im); - } - _mm_store_ps((float *)(tmp+k2)+k+0*OPERATOR_COMPONENT_OFFSET_float, buffer_re); - _mm_store_ps((float *)(tmp+k2)+k+1*OPERATOR_COMPONENT_OFFSET_float, buffer_im); - - - k1 = (n+2*num_eig_vect)*OPERATOR_COMPONENT_OFFSET_float; - k2 = (n+3*num_eig_vect)*OPERATOR_COMPONENT_OFFSET_float; - - // B - buffer_re = _mm_load_ps((float *)(tmp+k1)+k+0*OPERATOR_COMPONENT_OFFSET_float); - buffer_im = _mm_load_ps((float *)(tmp+k1)+k+1*OPERATOR_COMPONENT_OFFSET_float); - for ( m=0; m broadcast - __m128 spin_2_3_re = _mm_set1_ps(spin_2_3_pt[(2*m+0)*component_offset]); - __m128 spin_2_3_im = _mm_set1_ps(spin_2_3_pt[(2*m+1)*component_offset]); - __m128 interpolation_data_re = _mm_load_ps(interpolation_data + (2*m+0)*component_offset); - __m128 interpolation_data_im = _mm_load_ps(interpolation_data + (2*m+1)*component_offset); - - cfmadd_conj(interpolation_data_re, interpolation_data_im, spin_2_3_re, spin_2_3_im, &buffer_re, &buffer_im); - } - _mm_store_ps((float *)(tmp+k1)+k+0*OPERATOR_COMPONENT_OFFSET_float, buffer_re); - _mm_store_ps((float *)(tmp+k1)+k+1*OPERATOR_COMPONENT_OFFSET_float, buffer_im); - - // D - buffer_re = _mm_load_ps((float *)(tmp+k2)+k+0*OPERATOR_COMPONENT_OFFSET_float); - buffer_im = _mm_load_ps((float *)(tmp+k2)+k+1*OPERATOR_COMPONENT_OFFSET_float); - for ( m=offset; m<2*offset; m++ ) { - // spin_2_3 is the same for all k => broadcast - __m128 spin_2_3_re = _mm_set1_ps(spin_2_3_pt[(2*m+0)*component_offset]); - __m128 spin_2_3_im = _mm_set1_ps(spin_2_3_pt[(2*m+1)*component_offset]); - __m128 interpolation_data_re = _mm_load_ps(interpolation_data + (2*m+0)*component_offset); - __m128 interpolation_data_im = _mm_load_ps(interpolation_data + (2*m+1)*component_offset); - - cfmadd_conj(interpolation_data_re, interpolation_data_im, spin_2_3_re, spin_2_3_im, &buffer_re, &buffer_im); - } - _mm_store_ps((float *)(tmp+k2)+k+0*OPERATOR_COMPONENT_OFFSET_float, buffer_re); - _mm_store_ps((float *)(tmp+k2)+k+1*OPERATOR_COMPONENT_OFFSET_float, buffer_im); - } - } -#endif -} - - -static inline void sse_coarse_spinwise_site_self_couplings_float( complex_float *eta1, complex_float *eta2, - complex_float *phi, config_float clover, int elements, level_struct *l ) { - -#ifdef SSE - int num_eig_vect = l->num_lattice_site_var/2; - int clover_step_size1 = (num_eig_vect * (num_eig_vect+1))/2; - complex_float *eta[2] = {eta1, eta2}; - // U(x) = [ A B , A=A*, D=D*, C = -B* - // C D ] - // storage order: upper triangle of A, upper triangle of D, B, columnwise - - __m128 clover_re; - __m128 clover_im; - __m128 in_re; - __m128 in_im; - __m128 out_re; - __m128 out_im; - - // zero output matrices - __m128 zero = _mm_setzero_ps(); - for(int s=0; s<2; s++) { - for(int i=0; ieta1) or 2and3 (->eta2) - eta[1] += num_eig_vect*elements; - for(int s=0; s<2; s++) { - // A and D: column major hermitian, stored as upper triangular - for(int i=0; inum_parent_eig_vect; - int block_step_size = (num_eig_vect * (num_eig_vect+1))/2; - complex_float *eta[2] = {eta1, eta2}; - // U(x) = [ A 0 , A=A*, D=D* - // 0 D ] - // storage order: upper triangle of A, upper triangle of D, B, columnwise - - __m128 block_re; - __m128 block_im; - __m128 in_re; - __m128 in_im; - __m128 out_re; - __m128 
out_im; - - // zero output matrices - __m128 zero = _mm_setzero_ps(); - for(int s=0; s<2; s++) { - for(int i=0; ieta1) or 2and3 (->eta2) - eta[1] += num_eig_vect*elements; - for(int s=0; s<2; s++) { - // A and D: column major hermitian, stored as upper triangular - for(int i=0; inext_level->num_parent_eig_vect, - offset = l->num_parent_eig_vect; - float *spin_0_1_pt; - float *spin_2_3_pt; - float *interpolation_data; - - int component_offset = SIMD_LENGTH_float; - int fine_components = l->num_lattice_site_var; - - // U(x) = [ A 0 , A=A*, D=D* - // 0 D ] - // storage order: upper triangle of A, upper triangle of D, B, columnwise - // diagonal coupling - for ( int n=0; nvector_size + fine_components*SIMD_LENGTH_float*site); - - // A - buffer_re = _mm_load_ps((float *)(tmp+k1)+k+0*OPERATOR_COMPONENT_OFFSET_float); - buffer_im = _mm_load_ps((float *)(tmp+k1)+k+1*OPERATOR_COMPONENT_OFFSET_float); - for ( m=0; m broadcast - __m128 spin_0_1_re = _mm_set1_ps(spin_0_1_pt[(2*m+0)*component_offset]); - __m128 spin_0_1_im = _mm_set1_ps(spin_0_1_pt[(2*m+1)*component_offset]); - __m128 interpolation_data_re = _mm_load_ps(interpolation_data + (2*m+0)*component_offset); - __m128 interpolation_data_im = _mm_load_ps(interpolation_data + (2*m+1)*component_offset); - - cfmadd_conj(interpolation_data_re, interpolation_data_im, spin_0_1_re, spin_0_1_im, &buffer_re, &buffer_im); - } - _mm_store_ps((float *)(tmp+k1)+k+0*OPERATOR_COMPONENT_OFFSET_float, buffer_re); - _mm_store_ps((float *)(tmp+k1)+k+1*OPERATOR_COMPONENT_OFFSET_float, buffer_im); - - // D - buffer_re = _mm_load_ps((float *)(tmp+k2)+k+0*OPERATOR_COMPONENT_OFFSET_float); - buffer_im = _mm_load_ps((float *)(tmp+k2)+k+1*OPERATOR_COMPONENT_OFFSET_float); - for ( m=offset; m<2*offset; m++ ) { - // spin_2_3 is the same for all k => broadcast - __m128 spin_2_3_re = _mm_set1_ps(spin_2_3_pt[(2*m+0)*component_offset]); - __m128 spin_2_3_im = _mm_set1_ps(spin_2_3_pt[(2*m+1)*component_offset]); - __m128 interpolation_data_re = _mm_load_ps(interpolation_data + (2*m+0)*component_offset); - __m128 interpolation_data_im = _mm_load_ps(interpolation_data + (2*m+1)*component_offset); - - cfmadd_conj(interpolation_data_re, interpolation_data_im, spin_2_3_re, spin_2_3_im, &buffer_re, &buffer_im); - } - _mm_store_ps((float *)(tmp+k2)+k+0*OPERATOR_COMPONENT_OFFSET_float, buffer_re); - _mm_store_ps((float *)(tmp+k2)+k+1*OPERATOR_COMPONENT_OFFSET_float, buffer_im); - } - } -#endif -} - -#endif //SSE -#endif diff --git a/src/sse_coarse_operator_generic.c b/src/sse_coarse_operator_generic.c deleted file mode 100644 index cf3b73d..0000000 --- a/src/sse_coarse_operator_generic.c +++ /dev/null @@ -1,962 +0,0 @@ -/* - * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder. - * - * This file is part of the DDalphaAMG solver library. - * - * The DDalphaAMG solver library is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * The DDalphaAMG solver library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * - * You should have received a copy of the GNU General Public License - * along with the DDalphaAMG solver library. If not, see http://www.gnu.org/licenses/. 
- * - */ - -#include "main.h" - -#ifdef SSE - -#include "sse_coarse_operator.h" - -#ifdef INTERPOLATION_SETUP_LAYOUT_OPTIMIZED_PRECISION -void coarse_operator_PRECISION_setup_vectorized( complex_PRECISION *operator, level_struct *l, struct Thread *threading ) { - - SYNC_HYPERTHREADS(threading) - SYNC_CORES(threading) - - double t0, t1; - t0 = MPI_Wtime(); - - int mu, j, n = l->num_eig_vect, num_aggregates = l->is_PRECISION.num_agg, - aggregate_sites = l->num_inner_lattice_sites / num_aggregates, - clover_site_size = (l->num_eig_vect*(l->num_eig_vect*2+1)), - block_site_size = (l->num_eig_vect*(l->num_eig_vect+1)), - D_link_size = 4*l->num_eig_vect*l->num_eig_vect*4, // size of links in all 4 directions - fine_components = l->num_lattice_site_var; - - - - START_LOCKED_MASTER(threading) - operator_PRECISION_define( &(l->next_level->op_PRECISION), l->next_level ); - END_LOCKED_MASTER(threading) - SYNC_HYPERTHREADS(threading) - - // each thread loops overs its aggregates and then over internal d.o.f. - for ( int a=threading->n_core*threading->thread+threading->core; an_core*threading->n_thread ) { - for ( j=0; jnext_level->op_PRECISION.D[j+a*D_link_size] = _COMPLEX_PRECISION_ZERO; - for ( j=0; jnext_level->op_PRECISION.clover[j+a*clover_site_size] = _COMPLEX_PRECISION_ZERO; - for ( j=0; jnext_level->op_PRECISION.odd_proj[j+a*block_site_size] = _COMPLEX_PRECISION_ZERO; - } - - complex_PRECISION *mpi_buffer = NULL; - START_MASTER(threading) - MALLOC_HUGEPAGES( mpi_buffer, complex_PRECISION, OPERATOR_COMPONENT_OFFSET_PRECISION*(l->vector_size-l->inner_vector_size), 64 ); - END_MASTER(threading) - - int direction_flags[8*l->block_lattice[T]*l->block_lattice[Z]*l->block_lattice[Y]*l->block_lattice[X]]; - - // set up table for direction flags - int *flags = direction_flags; - if(l->depth == 0) { - // even sites - for(int t=0; t < l->block_lattice[T]; t++) { - for(int z=0; z < l->block_lattice[Z]; z++) { - for(int y=0; y < l->block_lattice[Y]; y++) { - for(int x=0; x < l->block_lattice[X]; x++) { - if((x+y+z+t)%2 == 0) { - flags[2*X+0] = (x == 0)?0:1; - flags[2*X+1] = (x == l->block_lattice[X]-1)?0:1; - flags[2*Y+0] = (y == 0)?0:1; - flags[2*Y+1] = (y == l->block_lattice[Y]-1)?0:1; - flags[2*Z+0] = (z == 0)?0:1; - flags[2*Z+1] = (z == l->block_lattice[Z]-1)?0:1; - flags[2*T+0] = (t == 0)?0:1; - flags[2*T+1] = (t == l->block_lattice[T]-1)?0:1; - flags += 8; - } - } - } - } - } - // odd sites - for(int t=0; t < l->block_lattice[T]; t++) { - for(int z=0; z < l->block_lattice[Z]; z++) { - for(int y=0; y < l->block_lattice[Y]; y++) { - for(int x=0; x < l->block_lattice[X]; x++) { - if((x+y+z+t)%2 == 1) { - flags[2*X+0] = (x == 0)?0:1; - flags[2*X+1] = (x == l->block_lattice[X]-1)?0:1; - flags[2*Y+0] = (y == 0)?0:1; - flags[2*Y+1] = (y == l->block_lattice[Y]-1)?0:1; - flags[2*Z+0] = (z == 0)?0:1; - flags[2*Z+1] = (z == l->block_lattice[Z]-1)?0:1; - flags[2*T+0] = (t == 0)?0:1; - flags[2*T+1] = (t == l->block_lattice[T]-1)?0:1; - flags += 8; - } - } - } - } - } - } else { - for(int t=0; t < l->block_lattice[T]; t++) { - for(int z=0; z < l->block_lattice[Z]; z++) { - for(int y=0; y < l->block_lattice[Y]; y++) { - for(int x=0; x < l->block_lattice[X]; x++) { - flags[2*X+0] = (x == 0)?0:1; - flags[2*X+1] = (x == l->block_lattice[X]-1)?0:1; - flags[2*Y+0] = (y == 0)?0:1; - flags[2*Y+1] = (y == l->block_lattice[Y]-1)?0:1; - flags[2*Z+0] = (z == 0)?0:1; - flags[2*Z+1] = (z == l->block_lattice[Z]-1)?0:1; - flags[2*T+0] = (t == 0)?0:1; - flags[2*T+1] = (t == l->block_lattice[T]-1)?0:1; - flags += 8; - } - } - } - } 
- } - - complex_PRECISION eta1[fine_components*OPERATOR_COMPONENT_OFFSET_PRECISION] __attribute__((aligned(64))); - complex_PRECISION eta2[fine_components*OPERATOR_COMPONENT_OFFSET_PRECISION] __attribute__((aligned(64))); - complex_PRECISION tmp[4*4*n*OPERATOR_COMPONENT_OFFSET_PRECISION] __attribute__((aligned(64))); - - for ( int a=threading->n_core*threading->thread+threading->core; an_core*threading->n_thread ) { - - // new aggregate is starting, zero out tmp - for(int i=0; i<4*4*n*OPERATOR_COMPONENT_OFFSET_PRECISION; i++) - tmp[i] = 0.0; - - for ( int site=a*aggregate_sites; site<(a+1)*aggregate_sites; site++ ) { - if(l->depth == 0) { - for ( int c=0; cnum_eig_vect; c+=SIMD_LENGTH_PRECISION ) - d_plus_clover_aggregate_PRECISION_vectorized( eta1+c*fine_components, eta2+c*fine_components, - operator+c*l->vector_size, &(l->s_PRECISION), l, site, - direction_flags+8*(site%(l->block_lattice[T]*l->block_lattice[Z]*l->block_lattice[Y]*l->block_lattice[X])) ); - } else { - for ( int c=0; cnum_eig_vect; c+=SIMD_LENGTH_PRECISION ) - coarse_aggregate_self_couplings_PRECISION_vectorized( eta1+c*fine_components, eta2+c*fine_components, - operator+c*l->vector_size, &(l->s_PRECISION), l, site, - direction_flags+8*(site%(l->block_lattice[T]*l->block_lattice[Z]*l->block_lattice[Y]*l->block_lattice[X])) ); - } - set_coarse_self_coupling_PRECISION_vectorized( eta1, eta2, operator, l, site, n, tmp ); - } - - // aggregate is done, finalize - set_coarse_self_coupling_PRECISION_vectorized_finalize( l, a*aggregate_sites, n, tmp ); - - } - - - SYNC_HYPERTHREADS(threading) - START_LOCKED_MASTER(threading) - // neighbors - for ( int c=0; cnum_eig_vect; c+=SIMD_LENGTH_PRECISION ) { - for ( mu=0; mu<4; mu++ ) { - // determine start of buffer for this mu - int start = 0; - for ( int j=0; js_PRECISION.op.c.num_boundary_sites[2*j]; - - // update ghost cells of V[i] - negative_sendrecv_PRECISION_vectorized( operator+c*l->vector_size, mu, &(l->s_PRECISION.op.c), l, - SIMD_LENGTH_PRECISION, mpi_buffer+c*(l->vector_size-l->inner_vector_size)+fine_components*start*SIMD_LENGTH_PRECISION ); - } - for ( mu=0; mu<4; mu++ ) { - // finish updating ghostcells of V[i] - negative_wait_PRECISION( mu, &(l->s_PRECISION.op.c), l ); - } - } - END_LOCKED_MASTER(threading) - SYNC_HYPERTHREADS(threading) - - - for ( int a=threading->n_core*threading->thread+threading->core; an_core*threading->n_thread ) { - - // new aggregate is starting, zero out tmp - for(int i=0; i<4*4*n*OPERATOR_COMPONENT_OFFSET_PRECISION; i++) - tmp[i] = 0.0; - - for ( int site=a*aggregate_sites; site<(a+1)*aggregate_sites; site++ ) { - for ( mu=0; mu<4; mu++ ) { - if( (direction_flags+8*(site%(l->block_lattice[T]*l->block_lattice[Z]*l->block_lattice[Y]*l->block_lattice[X])))[2*mu+1] != 0) - continue; - - if(l->depth == 0) - for ( int c=0; cnum_eig_vect; c+=SIMD_LENGTH_PRECISION ) - d_neighbor_aggregate_PRECISION_vectorized( eta1+c*fine_components, eta2+c*fine_components, - operator+c*l->vector_size, mu, &(l->s_PRECISION), l, site ); - else - for ( int c=0; cnum_eig_vect; c+=SIMD_LENGTH_PRECISION ) - coarse_aggregate_neighbor_couplings_PRECISION_vectorized( eta1+c*fine_components, eta2+c*fine_components, - operator+c*l->vector_size, mu, &(l->s_PRECISION), l, site ); - set_coarse_neighbor_coupling_PRECISION_vectorized( eta1, eta2, operator, mu, l, site, n, tmp+mu*4*n*OPERATOR_COMPONENT_OFFSET_PRECISION ); - } - } - - // aggregate is done, finalize - for ( mu=0; mu<4; mu++ ) - set_coarse_neighbor_coupling_PRECISION_vectorized_finalize( mu, l, a*aggregate_sites, n, 
tmp+mu*4*n*OPERATOR_COMPONENT_OFFSET_PRECISION ); - } - - SYNC_HYPERTHREADS(threading) - SYNC_CORES(threading) - - for ( int a=threading->n_core*threading->thread+threading->core; an_core*threading->n_thread ) { - - // new aggregate is starting, zero out tmp - for(int i=0; i<4*4*n*OPERATOR_COMPONENT_OFFSET_PRECISION; i++) - tmp[i] = 0.0; - - for ( int site=a*aggregate_sites; site<(a+1)*aggregate_sites; site++ ) { - if(l->depth == 0) { - for ( int c=0; cnum_eig_vect; c+=SIMD_LENGTH_PRECISION ) - diagonal_aggregate_PRECISION_vectorized( eta1+c*fine_components, eta2+c*fine_components, - operator+c*l->vector_size, &(l->s_PRECISION), l, site ); - } else { - for ( int c=0; cnum_eig_vect; c+=SIMD_LENGTH_PRECISION ) - coarse_aggregate_block_diagonal_PRECISION_vectorized( eta1+c*fine_components, eta2+c*fine_components, - operator+c*l->vector_size, &(l->s_PRECISION), l, site ); - } - set_coarse_block_diagonal_PRECISION_vectorized( eta1, eta2, operator, l, site, n, tmp ); - } - - // aggregate is done, finalize - set_coarse_block_diagonal_PRECISION_vectorized_finalize( l, a*aggregate_sites, n, tmp ); - } - - SYNC_HYPERTHREADS(threading) - SYNC_CORES(threading) - - coarse_operator_PRECISION_setup_finalize( l, threading ); - - START_MASTER(threading) - FREE_HUGEPAGES( mpi_buffer, complex_PRECISION, OPERATOR_COMPONENT_OFFSET_PRECISION*(l->vector_size-l->inner_vector_size) ); - - t1 = MPI_Wtime(); - if ( g.print > 0 ) printf0("depth: %d, time spent for setting up next coarser operator: %lf seconds\n", l->depth, t1-t0 ); - END_MASTER(threading) - - SYNC_HYPERTHREADS(threading) - SYNC_CORES(threading) -} -#endif - -void set_coarse_self_coupling_PRECISION_vectorized( complex_PRECISION *spin_0_1, complex_PRECISION *spin_2_3, - complex_PRECISION *V, level_struct *l, int site, const int n_rhs, complex_PRECISION *tmp ) { - - sse_set_coarse_self_coupling_PRECISION( spin_0_1, spin_2_3, V, l, site, n_rhs, tmp ); -} - -void set_coarse_block_diagonal_PRECISION_vectorized( complex_PRECISION *spin_0_1, complex_PRECISION *spin_2_3, - complex_PRECISION *V, level_struct *l, int site, const int n_rhs, complex_PRECISION *tmp ) { - - sse_set_coarse_block_diagonal_PRECISION( spin_0_1, spin_2_3, V, l, site, n_rhs, tmp ); -} - -void set_coarse_self_coupling_PRECISION_vectorized_finalize( level_struct *l, int site, const int n_rhs, complex_PRECISION *tmp ) { - - int k, k1, k2, num_aggregates = l->is_PRECISION.num_agg, - num_eig_vect = l->next_level->num_lattice_site_var/2, - aggregate_size = l->inner_vector_size / num_aggregates, - clover_site_size = (l->next_level->num_lattice_site_var*(l->next_level->num_lattice_site_var+1))/2; - int t1, t2; - - config_PRECISION clover_pt, clover = l->next_level->op_PRECISION.clover; - - // just an abbreviation - int component_offset = OPERATOR_COMPONENT_OFFSET_PRECISION; - int fine_components = l->num_lattice_site_var; - - int aggregate = (fine_components*site)/aggregate_size; - clover_pt = clover + aggregate*clover_site_size; - - // U(x) = [ A B , A=A*, D=D*, C = -B* - // C D ] - // storage order: upper triangle of A, upper triangle of D, B, columnwise - // diagonal coupling - for ( int n=0; nnext_level->num_lattice_site_var/2, - D_link_size = num_eig_vect*num_eig_vect*4; - int t1, t2; - - config_PRECISION D_pt, D = l->next_level->op_PRECISION.D; - - // just an abbreviation - int component_offset = OPERATOR_COMPONENT_OFFSET_PRECISION; - int fine_components = l->num_lattice_site_var; - - int aggregate = (fine_components*site)/(l->inner_vector_size / l->is_PRECISION.num_agg); - D_pt = D + 
(4*aggregate+mu)*D_link_size; - - // U_mu(x) = [ A B , U_-mu(x+muhat) = [ A* -C* - // C D ] -B* D* ] - // storage order: A, C, B, D, each column wise - for ( int n=0; nis_PRECISION.num_agg, - num_eig_vect = l->next_level->num_parent_eig_vect, - aggregate_size = l->inner_vector_size / num_aggregates, - block_site_size = (l->next_level->num_parent_eig_vect*(l->next_level->num_parent_eig_vect+1)); - int t1, t2; - - config_PRECISION block_pt, block = l->next_level->op_PRECISION.odd_proj; - - // just an abbreviation - int component_offset = OPERATOR_COMPONENT_OFFSET_PRECISION; - int fine_components = l->num_lattice_site_var; - - int aggregate = (fine_components*site)/aggregate_size; - block_pt = block + aggregate*block_site_size; - - // U(x) = [ A 0 , A=A*, D=D* - // 0 D ] - // storage order: upper triangle of A, upper triangle of D, B, columnwise - // diagonal coupling - for ( int n=0; n i) { - ip = j; - jp = i; - sign = -1.0; - } - int offset_to_column = (ip*ip+ip)/2; // upper triangle including diagonal - out_tmp[(2*i+0)*column_offset + j] = creal(clover[offset_to_column+jp]); - out_tmp[(2*i+1)*column_offset + j] = sign*cimag(clover[offset_to_column+jp]); - // C = -B^dagger - out_tmp[(2*i+0)*column_offset + j + vecs] = -creal(clover[offset_to_B + j*vecs+i]); - out_tmp[(2*i+1)*column_offset + j + vecs] = cimag(clover[offset_to_B + j*vecs+i]); - } - // zero - for(int j=2*vecs; j i) { - ip = j; - jp = i; - sign = -1.0; - } - int offset_to_column = (ip*ip+ip)/2; // upper triangle including diagonal - out_tmp[(2*(i+vecs)+0)*column_offset + j + vecs] = creal(clover[offset_to_D + offset_to_column+jp]); - out_tmp[(2*(i+vecs)+1)*column_offset + j + vecs] = sign*cimag(clover[offset_to_D + offset_to_column+jp]); - } - // zero - for(int j=2*vecs; j i) { - ip = j; - jp = i; - sign = -1.0; - } - int offset_to_column = (ip*ip+ip)/2; // upper triangle including diagonal - // A - out_tmp[(2*(i+0*vecs)+0)*column_offset + j + 0*vecs] = - out_tmp[(2*(i+1*vecs)+0)*column_offset + j + 1*vecs] = creal(clover[offset_to_column+jp]); - out_tmp[(2*(i+0*vecs)+1)*column_offset + j + 0*vecs] = - out_tmp[(2*(i+1*vecs)+1)*column_offset + j + 1*vecs] = sign*cimag(clover[offset_to_column+jp]); - // B - out_tmp[(2*(i+2*vecs)+0)*column_offset + j + 0*vecs] = - out_tmp[(2*(i+3*vecs)+0)*column_offset + j + 1*vecs] = creal(clover[offset_to_B + i*vecs+j]); - out_tmp[(2*(i+2*vecs)+1)*column_offset + j + 0*vecs] = - out_tmp[(2*(i+3*vecs)+1)*column_offset + j + 1*vecs] = cimag(clover[offset_to_B + i*vecs+j]); - // C = -B^dagger - out_tmp[(2*(i+0*vecs)+0)*column_offset + j + 2*vecs] = - out_tmp[(2*(i+1*vecs)+0)*column_offset + j + 3*vecs] = -creal(clover[offset_to_B + j*vecs+i]); - out_tmp[(2*(i+0*vecs)+1)*column_offset + j + 2*vecs] = - out_tmp[(2*(i+1*vecs)+1)*column_offset + j + 3*vecs] = cimag(clover[offset_to_B + j*vecs+i]); - // D - out_tmp[(2*(i+2*vecs)+0)*column_offset + j + 2*vecs] = - out_tmp[(2*(i+3*vecs)+0)*column_offset + j + 3*vecs] = creal(clover[offset_to_D + offset_to_column+jp]); - out_tmp[(2*(i+2*vecs)+1)*column_offset + j + 2*vecs] = - out_tmp[(2*(i+3*vecs)+1)*column_offset + j + 3*vecs] = sign*cimag(clover[offset_to_D + offset_to_column+jp]); - // 0 - out_tmp[(2*(i+0*vecs)+0)*column_offset + j + 1*vecs] = - out_tmp[(2*(i+0*vecs)+1)*column_offset + j + 1*vecs] = - out_tmp[(2*(i+0*vecs)+0)*column_offset + j + 3*vecs] = - out_tmp[(2*(i+0*vecs)+1)*column_offset + j + 3*vecs] = - out_tmp[(2*(i+1*vecs)+0)*column_offset + j + 0*vecs] = - out_tmp[(2*(i+1*vecs)+1)*column_offset + j + 0*vecs] = - 
out_tmp[(2*(i+1*vecs)+0)*column_offset + j + 2*vecs] = - out_tmp[(2*(i+1*vecs)+1)*column_offset + j + 2*vecs] = - out_tmp[(2*(i+2*vecs)+0)*column_offset + j + 1*vecs] = - out_tmp[(2*(i+2*vecs)+1)*column_offset + j + 1*vecs] = - out_tmp[(2*(i+2*vecs)+0)*column_offset + j + 3*vecs] = - out_tmp[(2*(i+2*vecs)+1)*column_offset + j + 3*vecs] = - out_tmp[(2*(i+3*vecs)+0)*column_offset + j + 0*vecs] = - out_tmp[(2*(i+3*vecs)+1)*column_offset + j + 0*vecs] = - out_tmp[(2*(i+3*vecs)+0)*column_offset + j + 2*vecs] = - out_tmp[(2*(i+3*vecs)+1)*column_offset + j + 2*vecs] = 0.0; - } - // zero - for(int j=4*vecs; j i) { - ip = j; - jp = i; - sign = -1.0; - } - int offset_to_column = (ip*ip+ip)/2; // upper triangle including diagonal - // E - out_tmp[(2*i+0)*column_offset + j] += sign*creal(tm_term[offset_to_column+jp]); - out_tmp[(2*i+1)*column_offset + j] += cimag(tm_term[offset_to_column+jp]); - // F - out_tmp[(2*(i+vecs)+0)*column_offset + j + vecs] += sign*creal(tm_term[offset_to_F + offset_to_column+jp]); - out_tmp[(2*(i+vecs)+1)*column_offset + j + vecs] += cimag(tm_term[offset_to_F + offset_to_column+jp]); - } - } - tm_term += 2*offset_to_F; - // out_tmp is an alias for the actual output - out_tmp += 2*column_offset*2*vecs; - } -#endif -} - -void add_tm_term_to_doublet_vectorized_layout_PRECISION(config_PRECISION tm_term, OPERATOR_TYPE_PRECISION *clover_vectorized, - int num_aggregates, int num_eig_vect) { -#ifdef HAVE_TM - int vecs = num_eig_vect; - // in vectorized layout clover is stored column wise, but not split into ABCD - // each column is padded, such that next column can also start at 64B boundary - int column_offset = SIMD_LENGTH_PRECISION*((4*vecs+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - // offset between blocks in clover - int offset_to_F = (vecs*vecs+vecs)/2; // upper triangle of A including diagonal - - PRECISION *out_tmp = clover_vectorized; - - // we add/sub the tm term to cloverD_vectorized - // A0B0 E000 0000 - // 0A0B + 0000 - 0E00 - // C0D0 00F0 0000 - // 0C0D 0000 000F - // 0000 0000 0000 - // (column wise, size of zeros such that columns length is multiple of 64B) - - // 4 directions - for ( int a=0; a i) { - ip = j; - jp = i; - sign = -1.0; - } - int offset_to_column = (ip*ip+ip)/2; // upper triangle including diagonal - // E - out_tmp[(2*(i+0*vecs)+0)*column_offset + j + 0*vecs] += sign*creal(tm_term[offset_to_column+jp]); - out_tmp[(2*(i+0*vecs)+1)*column_offset + j + 0*vecs] += cimag(tm_term[offset_to_column+jp]); - out_tmp[(2*(i+1*vecs)+0)*column_offset + j + 1*vecs] -= sign*creal(tm_term[offset_to_column+jp]); - out_tmp[(2*(i+1*vecs)+1)*column_offset + j + 1*vecs] -= cimag(tm_term[offset_to_column+jp]); - // F - out_tmp[(2*(i+2*vecs)+0)*column_offset + j + 2*vecs] += sign*creal(tm_term[offset_to_F+offset_to_column+jp]); - out_tmp[(2*(i+2*vecs)+1)*column_offset + j + 2*vecs] += cimag(tm_term[offset_to_F+offset_to_column+jp]); - out_tmp[(2*(i+3*vecs)+0)*column_offset + j + 3*vecs] -= sign*creal(tm_term[offset_to_F+offset_to_column+jp]); - out_tmp[(2*(i+3*vecs)+1)*column_offset + j + 3*vecs] -= cimag(tm_term[offset_to_F+offset_to_column+jp]); - } - } - tm_term += 2*offset_to_F; - // out_tmp is an alias for the actual output - out_tmp += 2*4*vecs*column_offset; - } -#endif -} - -void add_epsbar_term_to_doublet_vectorized_layout_PRECISION(config_PRECISION eps_term, OPERATOR_TYPE_PRECISION *clover_vectorized, - int num_aggregates, int num_eig_vect) { -#ifdef HAVE_TM1p1 - int vecs = num_eig_vect; - // in vectorized layout clover is stored column wise, but not split 
into ABCD - // each column is padded, such that next column can also start at 64B boundary - int column_offset = SIMD_LENGTH_PRECISION*((4*vecs+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - // offset between blocks in clover - int offset_to_F = (vecs*vecs+vecs)/2; // upper triangle of A including diagonal - - PRECISION *out_tmp = clover_vectorized; - - // we add the eps term to cloverD_vectorized - // A0B0 0E00 - // 0A0B + E000 - // C0D0 000F - // 0C0D 00F0 - // 0000 0000 - // (column wise, size of zeros such that columns length is multiple of 64B) - - // 4 directions - for ( int a=0; a i) { - ip = j; - jp = i; - sign = -1.0; - } - int offset_to_column = (ip*ip+ip)/2; // upper triangle including diagonal - // E - out_tmp[(2*(i+0*vecs)+0)*column_offset + j + 1*vecs] += sign*creal(eps_term[offset_to_column+jp]); - out_tmp[(2*(i+0*vecs)+1)*column_offset + j + 1*vecs] += cimag(eps_term[offset_to_column+jp]); - out_tmp[(2*(i+1*vecs)+0)*column_offset + j + 0*vecs] += sign*creal(eps_term[offset_to_column+jp]); - out_tmp[(2*(i+1*vecs)+1)*column_offset + j + 0*vecs] += cimag(eps_term[offset_to_column+jp]); - // F - out_tmp[(2*(i+2*vecs)+0)*column_offset + j + 3*vecs] += sign*creal(eps_term[offset_to_F+offset_to_column+jp]); - out_tmp[(2*(i+2*vecs)+1)*column_offset + j + 3*vecs] += cimag(eps_term[offset_to_F+offset_to_column+jp]); - out_tmp[(2*(i+3*vecs)+0)*column_offset + j + 2*vecs] += sign*creal(eps_term[offset_to_F+offset_to_column+jp]); - out_tmp[(2*(i+3*vecs)+1)*column_offset + j + 2*vecs] += cimag(eps_term[offset_to_F+offset_to_column+jp]); - } - } - eps_term += 2*offset_to_F; - // out_tmp is an alias for the actual output - out_tmp += 2*4*vecs*column_offset; - } -#endif -} - -void coarse_aggregate_self_couplings_PRECISION_vectorized( complex_PRECISION *eta1, complex_PRECISION *eta2, - complex_PRECISION *phi, schwarz_PRECISION_struct *s, - level_struct *l, int site, int *direction_flags ) { - - int offset = SIMD_LENGTH_PRECISION; - int site_offset = l->num_lattice_site_var*offset; - int index_bw; - int index_fw; - int *neighbor = s->op.neighbor_table; - int *backward_neighbor = s->op.backward_neighbor_table; - complex_PRECISION *phi_pt; - config_PRECISION D_pt; - config_PRECISION D = s->op.D; - int n = l->num_lattice_site_var; - int D_site_offset = 4*n*n; - int D_link_offset = n*n; - int clover_offset = (n*(n+1))/2*site; - - coarse_spinwise_site_self_couplings_PRECISION_vectorized( eta1, eta2, phi+site_offset*site, s->op.clover+clover_offset, offset, l ); - - for(int mu=0; mu<4; mu++) { - index_fw = neighbor[5*site+1 + mu]; - index_bw = backward_neighbor[5*site+1 + mu]; - - // from backward - if ( direction_flags[2*mu+0] == 1 ) { - D_pt = D + D_site_offset*index_bw + D_link_offset*mu; - phi_pt = phi + site_offset*index_bw; - coarse_spinwise_n_daggered_hopp_PRECISION_vectorized( eta1, eta2, phi_pt, D_pt, offset, l ); - } - - // from forward - if ( direction_flags[2*mu+1] == 1 ) { - D_pt = D + D_site_offset*site + D_link_offset*mu; - phi_pt = phi + site_offset*index_fw; - coarse_spinwise_n_hopp_PRECISION_vectorized( eta1, eta2, phi_pt, D_pt, offset, l ); - } - } -} - -void coarse_aggregate_block_diagonal_PRECISION_vectorized( complex_PRECISION *eta1, complex_PRECISION *eta2, - complex_PRECISION *phi, schwarz_PRECISION_struct *s, - level_struct *l, int site ) { - - int offset = SIMD_LENGTH_PRECISION; - int site_offset = l->num_lattice_site_var*offset; - int n = l->num_parent_eig_vect; - int block_offset = (n*(n+1))*site; - - sse_coarse_aggregate_block_diagonal_PRECISION( eta1, eta2, 
phi+site_offset*site, s->op.odd_proj+block_offset, offset, l ); -} - -void coarse_aggregate_neighbor_couplings_PRECISION_vectorized( complex_PRECISION *eta1, complex_PRECISION *eta2, - complex_PRECISION *phi, const int mu, - schwarz_PRECISION_struct *s, level_struct *l, int site ) { - - int offset = SIMD_LENGTH_PRECISION; - int site_offset = l->num_lattice_site_var*offset; - int index_fw; - int *neighbor = s->op.neighbor_table; - complex_PRECISION *phi_pt; - config_PRECISION D_pt; - config_PRECISION D = s->op.D; - int n = l->num_lattice_site_var; - int D_site_offset = 4*n*n; - int D_link_offset = n*n; - - vector_PRECISION_define( eta1, 0, 0, n*offset, l ); - vector_PRECISION_define( eta2, 0, 0, n*offset, l ); - - // requires the positive boundaries of phi to be communicated before - index_fw = neighbor[5*site+1 + mu]; - D_pt = D + D_site_offset*site + D_link_offset*mu; - phi_pt = phi + site_offset*index_fw; - coarse_spinwise_hopp_PRECISION_vectorized( eta1, eta2, phi_pt, D_pt, offset, l ); -} - - -void coarse_spinwise_site_self_couplings_PRECISION_vectorized( - complex_PRECISION *eta1, complex_PRECISION *eta2, - complex_PRECISION *phi, config_PRECISION clover, int elements, level_struct *l ) { - - sse_coarse_spinwise_site_self_couplings_PRECISION( eta1, eta2, phi, clover, elements, l ); -} - -#endif diff --git a/src/sse_coarse_operator_generic.h b/src/sse_coarse_operator_generic.h deleted file mode 100644 index fb7391a..0000000 --- a/src/sse_coarse_operator_generic.h +++ /dev/null @@ -1,311 +0,0 @@ -/* - * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder. - * - * This file is part of the DDalphaAMG solver library. - * - * The DDalphaAMG solver library is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * The DDalphaAMG solver library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * - * You should have received a copy of the GNU General Public License - * along with the DDalphaAMG solver library. If not, see http://www.gnu.org/licenses/. 
- * - */ - -#ifndef SSE_COARSE_OPERATOR_PRECISION_HEADER - #define SSE_COARSE_OPERATOR_PRECISION_HEADER - - #ifdef SSE - - #include "blas_vectorized.h" - - void coarse_operator_PRECISION_setup_vectorized( complex_PRECISION *operator, level_struct *l, struct Thread *threading ); - void set_coarse_self_coupling_PRECISION_vectorized( complex_PRECISION *spin_0_1, complex_PRECISION *spin_2_3, - complex_PRECISION *V, level_struct *l, int site, const int n_rhs, complex_PRECISION *tmp ); - void set_coarse_self_coupling_PRECISION_vectorized_finalize( level_struct *l, int site, const int n_rhs, complex_PRECISION *tmp ); - // here we do not check whether site is really on boundary, caller is responsible for that - // tmp is used to store coarse operator with padding, until sum over all sites has been done - void set_coarse_neighbor_coupling_PRECISION_vectorized( complex_PRECISION *buffer1, complex_PRECISION *buffer2, - complex_PRECISION *V, const int mu, level_struct *l, int site, const int n_rhs, complex_PRECISION *tmp ); - void set_coarse_neighbor_coupling_PRECISION_vectorized_finalize( const int mu, level_struct *l, int site, const int n_rhs, complex_PRECISION *tmp ); - void set_coarse_block_diagonal_PRECISION_vectorized( complex_PRECISION *spin_0_1, complex_PRECISION *spin_2_3, - complex_PRECISION *V, level_struct *l, int site, const int n_rhs, complex_PRECISION *tmp ); - void set_coarse_block_diagonal_PRECISION_vectorized_finalize( level_struct *l, int site, const int n_rhs, complex_PRECISION *tmp ); - - void copy_coarse_operator_to_vectorized_layout_PRECISION(config_PRECISION D, - OPERATOR_TYPE_PRECISION *D_vectorized, int num_aggregates, int num_eig_vect); - // fw and bw links have a symmetry that allows constructing one from another, see, e.g., coarse_hopp_PRECISION - // for vectorization we store the operator for both cases, the "daggered" links need this transformed layout - void copy_coarse_operator_to_transformed_vectorized_layout_PRECISION(config_PRECISION D, - OPERATOR_TYPE_PRECISION *D_vectorized, int num_aggregates, int num_eig_vect); - void copy_coarse_operator_clover_to_vectorized_layout_PRECISION(config_PRECISION clover, - OPERATOR_TYPE_PRECISION *clover_vectorized, int num_aggregates, int num_eig_vect); - void copy_coarse_operator_clover_to_doublet_vectorized_layout_PRECISION(config_PRECISION clover, - OPERATOR_TYPE_PRECISION *clover_vectorized, int num_aggregates, int num_eig_vect); - void add_tm_term_to_vectorized_layout_PRECISION(config_PRECISION tm_term, - OPERATOR_TYPE_PRECISION *clover_vectorized, int num_aggregates, int num_eig_vect); - void add_tm_term_to_doublet_vectorized_layout_PRECISION(config_PRECISION tm_term, - OPERATOR_TYPE_PRECISION *clover_vectorized, int num_aggregates, int num_eig_vect); - void add_epsbar_term_to_doublet_vectorized_layout_PRECISION(config_PRECISION eps_term, - OPERATOR_TYPE_PRECISION *clover_vectorized, int num_aggregates, int num_eig_vect); - - void coarse_spinwise_site_self_couplings_PRECISION_vectorized( - complex_PRECISION *eta1, complex_PRECISION *eta2, - complex_PRECISION *phi, config_PRECISION clover, int elements, level_struct *l ); - - void coarse_aggregate_self_couplings_PRECISION_vectorized( complex_PRECISION *eta1, complex_PRECISION *eta2, - complex_PRECISION *phi, schwarz_PRECISION_struct *s, level_struct *l, - int site, int *direction_flags ); - - void coarse_aggregate_neighbor_couplings_PRECISION_vectorized( complex_PRECISION *eta1, complex_PRECISION *eta2, - complex_PRECISION *phi, const int mu, schwarz_PRECISION_struct *s, 
level_struct *l, - int site ); - - void coarse_aggregate_block_diagonal_PRECISION_vectorized( complex_PRECISION *eta1, complex_PRECISION *eta2, - complex_PRECISION *phi, schwarz_PRECISION_struct *s, level_struct *l, - int site); - - - static inline void coarse_hopp_PRECISION_vectorized( vector_PRECISION eta, vector_PRECISION phi, - OPERATOR_TYPE_PRECISION *D, level_struct *l ) { -#ifdef OPTIMIZED_COARSE_NEIGHBOR_COUPLING_PRECISION - int nv = l->num_parent_eig_vect; - int lda = 2*SIMD_LENGTH_PRECISION*((nv+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - cgenmv_padded( 2*nv, D, lda, nv, (float *)phi, (float *)eta); -#endif - } - static inline void coarse_n_hopp_PRECISION_vectorized( vector_PRECISION eta, vector_PRECISION phi, - OPERATOR_TYPE_PRECISION *D, level_struct *l ) { -#ifdef OPTIMIZED_COARSE_NEIGHBOR_COUPLING_PRECISION - int nv = l->num_parent_eig_vect; - int lda = 2*SIMD_LENGTH_PRECISION*((nv+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - cgemv_padded( 2*nv, D, lda, nv, (float *)phi, (float *)eta); -#endif - } - - static inline void coarse_self_couplings_PRECISION_vectorized( vector_PRECISION eta, vector_PRECISION phi, - operator_PRECISION_struct *op, int start, int end, level_struct *l ) { -#ifdef OPTIMIZED_COARSE_SELF_COUPLING_PRECISION - int site_size = l->num_lattice_site_var; - int lda = SIMD_LENGTH_PRECISION*((site_size+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); -#ifdef HAVE_TM1p1 - OPERATOR_TYPE_PRECISION *clover = (g.n_flavours == 2) ? op->clover_doublet_vectorized:op->clover_vectorized; -#else - OPERATOR_TYPE_PRECISION *clover = op->clover_vectorized; -#endif - for(int i=start; inum_lattice_site_var/2; - int num_eig_vect2 = num_eig_vect*num_eig_vect; - complex_PRECISION *eta[2] = {eta1, eta2}; - // U_mu(x) = [ A B , U_-mu(x+muhat) = [ A* -C* - // C D ] -B* D* ] - // storage order: A, C, B, D - - __m128 D_re; - __m128 D_im; - __m128 in_re; - __m128 in_im; - __m128 out_re; - __m128 out_im; - // s refers to "spin" components 0and1 (->eta1) or 2and3 (->eta2) - for(int s=0; s<2; s++) { - // t is the row of the input matrix (in 2x2 block form) - for(int t=0; t<2; t++) { - for(int i=0; inum_lattice_site_var/2; - int num_eig_vect2 = num_eig_vect*num_eig_vect; - complex_PRECISION *eta[2] = {eta1, eta2}; - // U_mu(x) = [ A B , U_-mu(x+muhat) = [ A* -C* - // C D ] -B* D* ] - // storage order: A, C, B, D - // note: minus sign of D = self_coupling - hopping_term is added here - - __m128 D_re; - __m128 D_im; - __m128 in_re; - __m128 in_im; - __m128 out_re; - __m128 out_im; - // s refers to "spin" components 0and1 (->eta1) or 2and3 (->eta2) - for(int s=0; s<2; s++) { - // t is the row of the input matrix (in 2x2 block form) - for(int t=0; t<2; t++) { - for(int i=0; inum_lattice_site_var/2; - int num_eig_vect2 = num_eig_vect*num_eig_vect; - complex_PRECISION *eta[2] = {eta1, eta2}; - // U_mu(x) = [ A B , U_-mu(x+muhat) = [ A* -C* - // C D ] -B* D* ] - // storage order: A, C, B, D - // note: minus sign of D = self_coupling - hopping_term is added here - - __m128 D_re; - __m128 D_im; - __m128 in_re; - __m128 in_im; - __m128 out_re; - __m128 out_im; - // A* - for(int i=0; i1?((k)*3+6):((k)*3)) -#define index_d_re(phi,mu,spin) (gamma_re_sign[mu][spin]) * (phi)[ 6*gamma_co[mu][spin] + 12*(gamma_co[mu][spin]/2) + gamma_offset[mu][spin] ] -#define index_d_im(phi,mu,spin) (gamma_im_sign[mu][spin]) * (phi)[ 6*gamma_co[mu][spin] + 12*(gamma_co[mu][spin]/2) - gamma_offset[mu][spin] +1 ] - -#define neighbor_coupling_file "sse_dirac_su3local.h" - -void prp_double( complex_double *prn[4], 
complex_double *phi, int start, int end ) { - - double *phi_pt = (double*)(phi+start); - double *phi_end = (double*)(phi+end); - double *pr[4] = {(double*)(prn[0]+start/2),(double*)(prn[1]+start/2),(double*)(prn[2]+start/2),(double*)(prn[3]+start/2)}; - - while ( phi_pt < phi_end ) { - - __m128d phi_pt1_re; __m128d phi_pt1_im; - - sse_complex_deinterleaved_load_pd( phi_pt, &phi_pt1_re, &phi_pt1_im ); - for ( int mu=0; mu<4; mu++) { - __m128d phi_pt2_re = _mm_setr_pd( index_re(phi_pt,mu,0), index_re(phi_pt+2,mu,0) ); - __m128d phi_pt2_im = _mm_setr_pd( index_im(phi_pt,mu,0), index_im(phi_pt+2,mu,0) ); - __m128d res_re = _mm_sub_pd( phi_pt1_re, phi_pt2_re ); - __m128d res_im = _mm_sub_pd( phi_pt1_im, phi_pt2_im ); - sse_complex_interleaved_store_pd( res_re, res_im, pr[mu] ); - pr[mu] += 2*SIMD_LENGTH_double; - } - - sse_complex_deinterleaved_load_pd( phi_pt+4, &phi_pt1_re, &phi_pt1_im ); - for ( int mu=0; mu<4; mu++) { - __m128d phi_pt2_re = _mm_setr_pd( index_re(phi_pt+4,mu,0), index_re(phi_pt,mu,1) ); - __m128d phi_pt2_im = _mm_setr_pd( index_im(phi_pt+4,mu,0), index_im(phi_pt,mu,1) ); - __m128d res_re = _mm_sub_pd( phi_pt1_re, phi_pt2_re ); - __m128d res_im = _mm_sub_pd( phi_pt1_im, phi_pt2_im ); - sse_complex_interleaved_store_pd( res_re, res_im, pr[mu] ); - pr[mu] += 2*SIMD_LENGTH_double; - } - - sse_complex_deinterleaved_load_pd( phi_pt+8, &phi_pt1_re, &phi_pt1_im ); - for ( int mu=0; mu<4; mu++) { - __m128d phi_pt2_re = _mm_setr_pd( index_re(phi_pt+2,mu,1), index_re(phi_pt+4,mu,1) ); - __m128d phi_pt2_im = _mm_setr_pd( index_im(phi_pt+2,mu,1), index_im(phi_pt+4,mu,1) ); - __m128d res_re = _mm_sub_pd( phi_pt1_re, phi_pt2_re ); - __m128d res_im = _mm_sub_pd( phi_pt1_im, phi_pt2_im ); - sse_complex_interleaved_store_pd( res_re, res_im, pr[mu] ); - pr[mu] += 2*SIMD_LENGTH_double; - } - - phi_pt += 24; - } -} - - -void prp_float( complex_float *prn[4], complex_float *phi, int start, int end ) { - - float *phi_pt = (float*)(phi+start); - float *phi_end = (float*)(phi+end); - float *pr[4] = {(float*)(prn[0]+start/2),(float*)(prn[1]+start/2),(float*)(prn[2]+start/2),(float*)(prn[3]+start/2)}; - - while ( phi_pt < phi_end ) { - - __m128 phi_pt1_re = _mm_setr_ps( phi_pt[0], phi_pt[2], phi_pt[4], phi_pt[6] ); - __m128 phi_pt1_im = _mm_setr_ps( phi_pt[1], phi_pt[3], phi_pt[5], phi_pt[7] ); - for ( int mu=0; mu<4; mu++) { - __m128 phi_pt2_re = _mm_setr_ps( index_re(phi_pt,mu,0), index_re(phi_pt+2,mu,0), - index_re(phi_pt+4,mu,0), index_re(phi_pt,mu,1) ); - __m128 phi_pt2_im = _mm_setr_ps( index_im(phi_pt,mu,0), index_im(phi_pt+2,mu,0), - index_im(phi_pt+4,mu,0), index_im(phi_pt,mu,1) ); - - __m128 res_re = _mm_sub_ps( phi_pt1_re, phi_pt2_re ); - __m128 res_im = _mm_sub_ps( phi_pt1_im, phi_pt2_im ); - - sse_complex_interleaved_store( res_re, res_im, pr[mu] ); - pr[mu] += 8; - } - - phi_pt1_re = _mm_setr_ps( phi_pt[8], phi_pt[10], phi_pt[24], phi_pt[26] ); - phi_pt1_im = _mm_setr_ps( phi_pt[9], phi_pt[11], phi_pt[25], phi_pt[27] ); - for ( int mu=0; mu<4; mu++) { - __m128 phi_pt2_re = _mm_setr_ps( index_re(phi_pt+2,mu,1), index_re(phi_pt+4,mu,1), - index_re(phi_pt+24,mu,0), index_re(phi_pt+26,mu,0) ); - __m128 phi_pt2_im = _mm_setr_ps( index_im(phi_pt+2,mu,1), index_im(phi_pt+4,mu,1), - index_im(phi_pt+24,mu,0), index_im(phi_pt+26,mu,0) ); - __m128 res_re = _mm_sub_ps( phi_pt1_re, phi_pt2_re ); - __m128 res_im = _mm_sub_ps( phi_pt1_im, phi_pt2_im ); - - sse_complex_interleaved_store( res_re, res_im, pr[mu] ); - pr[mu] += 8; - } - - phi_pt1_re = _mm_setr_ps( phi_pt[28], phi_pt[30], phi_pt[32], 
phi_pt[34] ); - phi_pt1_im = _mm_setr_ps( phi_pt[29], phi_pt[31], phi_pt[33], phi_pt[35] ); - for ( int mu=0; mu<4; mu++) { - __m128 phi_pt2_re = _mm_setr_ps( index_re(phi_pt+28,mu,0), index_re(phi_pt+24,mu,1), - index_re(phi_pt+26,mu,1), index_re(phi_pt+28,mu,1) ); - __m128 phi_pt2_im = _mm_setr_ps( index_im(phi_pt+28,mu,0), index_im(phi_pt+24,mu,1), - index_im(phi_pt+26,mu,1), index_im(phi_pt+28,mu,1) ); - __m128 res_re = _mm_sub_ps( phi_pt1_re, phi_pt2_re ); - __m128 res_im = _mm_sub_ps( phi_pt1_im, phi_pt2_im ); - - sse_complex_interleaved_store( res_re, res_im, pr[mu] ); - pr[mu] += 8; - } - - phi_pt+=48; - } -} - - -void dprp_double( complex_double *prn[4], complex_double *phi, int start, int end ) { - - double *phi_pt = (double*)(phi+start); - double *phi_end = (double*)(phi+end); - double *pr[4] = {(double*)(prn[0]+start/2),(double*)(prn[1]+start/2),(double*)(prn[2]+start/2),(double*)(prn[3]+start/2)}; - - while ( phi_pt < phi_end ) { - - __m128d phi_pt1_re; __m128d phi_pt1_im; - - sse_complex_deinterleaved_load_pd( phi_pt, &phi_pt1_re, &phi_pt1_im ); - for ( int mu=0; mu<4; mu++) { - __m128d phi_pt2_re = _mm_setr_pd( index_d_re(phi_pt,mu,0), index_d_re(phi_pt+2,mu,0) ); - __m128d phi_pt2_im = _mm_setr_pd( index_d_im(phi_pt,mu,0), index_d_im(phi_pt+2,mu,0) ); - __m128d res_re = _mm_sub_pd( phi_pt1_re, phi_pt2_re ); - __m128d res_im = _mm_sub_pd( phi_pt1_im, phi_pt2_im ); - sse_complex_interleaved_store_pd( res_re, res_im, pr[mu] ); - pr[mu] += 2*SIMD_LENGTH_double; - } - - sse_complex_deinterleaved_load_pd( phi_pt+4, &phi_pt1_re, &phi_pt1_im ); - for ( int mu=0; mu<4; mu++) { - __m128d phi_pt2_re = _mm_setr_pd( index_d_re(phi_pt+4,mu,0), index_d_re(phi_pt,mu,1) ); - __m128d phi_pt2_im = _mm_setr_pd( index_d_im(phi_pt+4,mu,0), index_d_im(phi_pt,mu,1) ); - __m128d res_re = _mm_sub_pd( phi_pt1_re, phi_pt2_re ); - __m128d res_im = _mm_sub_pd( phi_pt1_im, phi_pt2_im ); - sse_complex_interleaved_store_pd( res_re, res_im, pr[mu] ); - pr[mu] += 2*SIMD_LENGTH_double; - } - - sse_complex_deinterleaved_load_pd( phi_pt+8, &phi_pt1_re, &phi_pt1_im ); - for ( int mu=0; mu<4; mu++) { - __m128d phi_pt2_re = _mm_setr_pd( index_d_re(phi_pt+2,mu,1), index_d_re(phi_pt+4,mu,1) ); - __m128d phi_pt2_im = _mm_setr_pd( index_d_im(phi_pt+2,mu,1), index_d_im(phi_pt+4,mu,1) ); - __m128d res_re = _mm_sub_pd( phi_pt1_re, phi_pt2_re ); - __m128d res_im = _mm_sub_pd( phi_pt1_im, phi_pt2_im ); - sse_complex_interleaved_store_pd( res_re, res_im, pr[mu] ); - pr[mu] += 2*SIMD_LENGTH_double; - } - - sse_complex_deinterleaved_load_pd( phi_pt+12, &phi_pt1_re, &phi_pt1_im ); - for ( int mu=0; mu<4; mu++) { - __m128d phi_pt2_re = _mm_setr_pd( index_d_re(phi_pt+12,mu,0), index_d_re(phi_pt+14,mu,0) ); - __m128d phi_pt2_im = _mm_setr_pd( index_d_im(phi_pt+12,mu,0), index_d_im(phi_pt+14,mu,0) ); - __m128d res_re = _mm_sub_pd( phi_pt1_re, phi_pt2_re ); - __m128d res_im = _mm_sub_pd( phi_pt1_im, phi_pt2_im ); - sse_complex_interleaved_store_pd( res_re, res_im, pr[mu] ); - pr[mu] += 2*SIMD_LENGTH_double; - } - - sse_complex_deinterleaved_load_pd( phi_pt+16, &phi_pt1_re, &phi_pt1_im ); - for ( int mu=0; mu<4; mu++) { - __m128d phi_pt2_re = _mm_setr_pd( index_d_re(phi_pt+16,mu,0), index_d_re(phi_pt+12,mu,1) ); - __m128d phi_pt2_im = _mm_setr_pd( index_d_im(phi_pt+16,mu,0), index_d_im(phi_pt+12,mu,1) ); - __m128d res_re = _mm_sub_pd( phi_pt1_re, phi_pt2_re ); - __m128d res_im = _mm_sub_pd( phi_pt1_im, phi_pt2_im ); - sse_complex_interleaved_store_pd( res_re, res_im, pr[mu] ); - pr[mu] += 2*SIMD_LENGTH_double; - } - - 
sse_complex_deinterleaved_load_pd( phi_pt+20, &phi_pt1_re, &phi_pt1_im ); - for ( int mu=0; mu<4; mu++) { - __m128d phi_pt2_re = _mm_setr_pd( index_d_re(phi_pt+14,mu,1), index_d_re(phi_pt+16,mu,1) ); - __m128d phi_pt2_im = _mm_setr_pd( index_d_im(phi_pt+14,mu,1), index_d_im(phi_pt+16,mu,1) ); - __m128d res_re = _mm_sub_pd( phi_pt1_re, phi_pt2_re ); - __m128d res_im = _mm_sub_pd( phi_pt1_im, phi_pt2_im ); - sse_complex_interleaved_store_pd( res_re, res_im, pr[mu] ); - pr[mu] += 2*SIMD_LENGTH_double; - } - - phi_pt += 48; - } -} - - -void dprp_float( complex_float *prn[4], complex_float *phi, int start, int end ) { - - float *phi_pt = (float*)(phi+start); - float *phi_end = (float*)(phi+end); - float *pr[4] = {(float*)(prn[0]+start/2),(float*)(prn[1]+start/2),(float*)(prn[2]+start/2),(float*)(prn[3]+start/2)}; - - while ( phi_pt < phi_end ) { - - __m128 phi_pt1_re = _mm_setr_ps( phi_pt[0], phi_pt[2], phi_pt[4], phi_pt[6] ); - __m128 phi_pt1_im = _mm_setr_ps( phi_pt[1], phi_pt[3], phi_pt[5], phi_pt[7] ); - for ( int mu=0; mu<4; mu++) { - __m128 phi_pt2_re = _mm_setr_ps( index_d_re(phi_pt,mu,0), index_d_re(phi_pt+2,mu,0), - index_d_re(phi_pt+4,mu,0), index_d_re(phi_pt,mu,1) ); - __m128 phi_pt2_im = _mm_setr_ps( index_d_im(phi_pt,mu,0), index_d_im(phi_pt+2,mu,0), - index_d_im(phi_pt+4,mu,0), index_d_im(phi_pt,mu,1) ); - - __m128 res_re = _mm_sub_ps( phi_pt1_re, phi_pt2_re ); - __m128 res_im = _mm_sub_ps( phi_pt1_im, phi_pt2_im ); - - sse_complex_interleaved_store( res_re, res_im, pr[mu] ); - pr[mu] += 8; - } - - phi_pt1_re = _mm_setr_ps( phi_pt[8], phi_pt[10], phi_pt[12], phi_pt[14] ); - phi_pt1_im = _mm_setr_ps( phi_pt[9], phi_pt[11], phi_pt[13], phi_pt[15] ); - for ( int mu=0; mu<4; mu++) { - __m128 phi_pt2_re = _mm_setr_ps( index_d_re(phi_pt+2,mu,1), index_d_re(phi_pt+4,mu,1), - index_d_re(phi_pt+12,mu,0), index_d_re(phi_pt+14,mu,0) ); - __m128 phi_pt2_im = _mm_setr_ps( index_d_im(phi_pt+2,mu,1), index_d_im(phi_pt+4,mu,1), - index_d_im(phi_pt+12,mu,0), index_d_im(phi_pt+14,mu,0) ); - __m128 res_re = _mm_sub_ps( phi_pt1_re, phi_pt2_re ); - __m128 res_im = _mm_sub_ps( phi_pt1_im, phi_pt2_im ); - - sse_complex_interleaved_store( res_re, res_im, pr[mu] ); - pr[mu] += 8; - } - - phi_pt1_re = _mm_setr_ps( phi_pt[16], phi_pt[18], phi_pt[20], phi_pt[22] ); - phi_pt1_im = _mm_setr_ps( phi_pt[17], phi_pt[19], phi_pt[21], phi_pt[23] ); - for ( int mu=0; mu<4; mu++) { - __m128 phi_pt2_re = _mm_setr_ps( index_d_re(phi_pt+16,mu,0), index_d_re(phi_pt+12,mu,1), - index_d_re(phi_pt+14,mu,1), index_d_re(phi_pt+16,mu,1) ); - __m128 phi_pt2_im = _mm_setr_ps( index_d_im(phi_pt+16,mu,0), index_d_im(phi_pt+12,mu,1), - index_d_im(phi_pt+14,mu,1), index_d_im(phi_pt+16,mu,1) ); - __m128 res_re = _mm_sub_ps( phi_pt1_re, phi_pt2_re ); - __m128 res_im = _mm_sub_ps( phi_pt1_im, phi_pt2_im ); - - sse_complex_interleaved_store( res_re, res_im, pr[mu] ); - pr[mu] += 8; - } - - phi_pt+=48; - } -} - - -void prn_su3_double( complex_double *prp[4], complex_double *phi, operator_double_struct *op, int *neighbor, int start, int end ) { - - double *phi_pt = (double*)(phi+start); - double *phi_end_pt = (double*)(phi+end); - double *pr[4] = {(double*)(prp[0]),(double*)(prp[1]),(double*)(prp[2]),(double*)(prp[3])}; - double *D_pt = ((double*)(op->D))+2*(start*3); - int *nb_pt = neighbor+((start/12)*4); - - while ( phi_pt < phi_end_pt ) { - - __m128d in_re[3]; - __m128d in_im[3]; - - for ( int i=0; i<3; i++ ) { - in_re[i] = _mm_setr_pd( phi_pt[2*i+0], phi_pt[2*i+6] ); - in_im[i] = _mm_setr_pd( phi_pt[2*i+1], phi_pt[2*i+7] ); - } - - 
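The cmul_conj_pd / cfmadd_conj_pd helpers used in the loop that follows keep real and imaginary parts in separate __m128d lanes; in scalar terms they perform a complex multiply(-accumulate) with one factor conjugated, which is how the daggered link enters the negative-direction projection here. A scalar sketch under that assumption (the conjugated factor is taken to be the first operand; this is an illustration, not the library API):

    #include <complex.h>
    /* scalar analogue, assuming the first factor is the conjugated one:
         cmul_conj   : *res  = conj(a) * b
         cfmadd_conj : *res += conj(a) * b */
    static inline void cmul_conj_scalar  ( double complex a, double complex b, double complex *res ) { *res  = conj(a) * b; }
    static inline void cfmadd_conj_scalar( double complex a, double complex b, double complex *res ) { *res += conj(a) * b; }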
for ( int mu=0; mu<4; mu++ ) { - - __m128d v_re[3]; - __m128d v_im[3]; - - // calc spin projection - for ( int i=0; i<3; i++ ) { - v_re[i] = _mm_setr_pd( index_re(phi_pt+2*i,mu,0), index_re(phi_pt+2*i,mu,1) ); - v_im[i] = _mm_setr_pd( index_im(phi_pt+2*i,mu,0), index_im(phi_pt+2*i,mu,1) ); - v_re[i] = _mm_add_pd( in_re[i], v_re[i] ); - v_im[i] = _mm_add_pd( in_im[i], v_im[i] ); - } - - { - __m128d res_re[3]; - __m128d res_im[3]; - // load su(3) matrix and multiply - for ( int i=0; i<3; i++ ) { - __m128d buf_re = _mm_set1_pd( D_pt[0+2*i] ); - __m128d buf_im = _mm_set1_pd( D_pt[1+2*i] ); - cmul_conj_pd( buf_re, buf_im, v_re[0], v_im[0], &res_re[i], &res_im[i] ); - buf_re = _mm_set1_pd( D_pt[6+2*i] ); - buf_im = _mm_set1_pd( D_pt[7+2*i] ); - cfmadd_conj_pd( buf_re, buf_im, v_re[1], v_im[1], &res_re[i], &res_im[i] ); - buf_re = _mm_set1_pd( D_pt[12+2*i] ); - buf_im = _mm_set1_pd( D_pt[13+2*i] ); - cfmadd_conj_pd( buf_re, buf_im, v_re[2], v_im[2], &res_re[i], &res_im[i] ); - } - - { - double *pr_pt = pr[mu]+2*6*(*(nb_pt)); - for ( int i=0; i<3; i++ ) { - __m128d out1 = _mm_unpacklo_pd( res_re[i], res_im[i] ); - __m128d out2 = _mm_unpackhi_pd( res_re[i], res_im[i] ); - _mm_storeu_pd( pr_pt+0+2*i, out1 ); - _mm_storeu_pd( pr_pt+6+2*i, out2 ); - } - } - } - - D_pt += 18; - nb_pt++; - } - - phi_pt += 12*2; - } - -} - - -void prn_su3_float( complex_float *prp[4], complex_float *phi, operator_float_struct *op, int *neighbor, int start, int end ) { - - float *phi_pt = (float*)(phi+start); - float *phi_end_pt = (float*)(phi+end); - float *pr[4] = {(float*)(prp[0]),(float*)(prp[1]),(float*)(prp[2]),(float*)(prp[3])}; - float *D_pt = (float*)(op->D_transformed_vectorized+2*(start*4)); - int *nb_pt = neighbor+((start/12)*4); - - while ( phi_pt < phi_end_pt ) { - - __m128 in1[2]; - __m128 in2[2]; - - in1[0] = _mm_setr_ps( phi_pt[0], phi_pt[2], phi_pt[4], 0 ); - in1[1] = _mm_setr_ps( phi_pt[1], phi_pt[3], phi_pt[5], 0 ); - in2[0] = _mm_setr_ps( phi_pt[6], phi_pt[8], phi_pt[10], 0 ); - in2[1] = _mm_setr_ps( phi_pt[7], phi_pt[9], phi_pt[11], 0 ); - - for ( int mu=0; mu<4; mu++ ) { - __m128 res1[2]; - __m128 res2[2]; - - { - // calc spin0 projection - res1[0] = _mm_setr_ps( index_re(phi_pt,mu,0), index_re(phi_pt+2,mu,0), index_re(phi_pt+4,mu,0), 0 ); - res1[1] = _mm_setr_ps( index_im(phi_pt,mu,0), index_im(phi_pt+2,mu,0), index_im(phi_pt+4,mu,0), 0 ); - __m128 in1_re = _mm_add_ps( in1[0], res1[0] ); - __m128 in1_im = _mm_add_ps( in1[1], res1[1] ); - - // calc spin1 projection - res1[0] = _mm_setr_ps( index_re(phi_pt,mu,1), index_re(phi_pt+2,mu,1), index_re(phi_pt+4,mu,1), 0 ); - res1[1] = _mm_setr_ps( index_im(phi_pt,mu,1), index_im(phi_pt+2,mu,1), index_im(phi_pt+4,mu,1), 0 ); - __m128 in2_re = _mm_add_ps( in2[0], res1[0] ); - __m128 in2_im = _mm_add_ps( in2[1], res1[1] ); - - // load 1st part of su(3) matrix and multiply - { - __m128 buf1 = _mm_loadu_ps( D_pt ); - __m128 buf2 = _mm_loadu_ps( D_pt+SIMD_LENGTH_float ); - { - __m128 buf3 = _mm_shuffle_ps( in1_re, in1_re, _MM_SHUFFLE(0,0,0,0) ); - __m128 buf4 = _mm_shuffle_ps( in1_im, in1_im, _MM_SHUFFLE(0,0,0,0) ); - cmul_conj( buf1, buf2, buf3, buf4, &res1[0], &res1[1] ); - } - { - __m128 buf3 = _mm_shuffle_ps( in2_re, in2_re, _MM_SHUFFLE(0,0,0,0) ); - __m128 buf4 = _mm_shuffle_ps( in2_im, in2_im, _MM_SHUFFLE(0,0,0,0) ); - cmul_conj( buf1, buf2, buf3, buf4, &res2[0], &res2[1] ); - } - } - // load 2nd part of su(3) matrix and multiply - { - __m128 buf1 = _mm_loadu_ps( D_pt+2*SIMD_LENGTH_float ); - __m128 buf2 = _mm_loadu_ps( D_pt+3*SIMD_LENGTH_float ); - { - 
__m128 buf3 = _mm_shuffle_ps( in1_re, in1_re, _MM_SHUFFLE(1,1,1,1) ); - __m128 buf4 = _mm_shuffle_ps( in1_im, in1_im, _MM_SHUFFLE(1,1,1,1) ); - cfmadd_conj( buf1, buf2, buf3, buf4, &res1[0], &res1[1] ); - } - { - __m128 buf3 = _mm_shuffle_ps( in2_re, in2_re, _MM_SHUFFLE(1,1,1,1) ); - __m128 buf4 = _mm_shuffle_ps( in2_im, in2_im, _MM_SHUFFLE(1,1,1,1) ); - cfmadd_conj( buf1, buf2, buf3, buf4, &res2[0], &res2[1] ); - } - } - // load 3rd part of su(3) matrix and multiply - { - __m128 buf1 = _mm_loadu_ps( D_pt+4*SIMD_LENGTH_float ); - __m128 buf2 = _mm_loadu_ps( D_pt+5*SIMD_LENGTH_float ); - { - __m128 buf3 = _mm_shuffle_ps( in1_re, in1_re, _MM_SHUFFLE(2,2,2,2) ); - __m128 buf4 = _mm_shuffle_ps( in1_im, in1_im, _MM_SHUFFLE(2,2,2,2) ); - cfmadd_conj( buf1, buf2, buf3, buf4, &res1[0], &res1[1] ); - } - { - __m128 buf3 = _mm_shuffle_ps( in2_re, in2_re, _MM_SHUFFLE(2,2,2,2) ); - __m128 buf4 = _mm_shuffle_ps( in2_im, in2_im, _MM_SHUFFLE(2,2,2,2) ); - cfmadd_conj( buf1, buf2, buf3, buf4, &res2[0], &res2[1] ); - } - } - } - - { - __m128 buf1 = _mm_unpacklo_ps( res1[0], res1[1] ); - __m128 buf2 = _mm_unpackhi_ps( res1[0], res1[1] ); - __m128 buf3 = _mm_unpacklo_ps( res2[0], res2[1] ); - - { - __m128 buf4 = _mm_unpackhi_ps( res2[0], res2[1] ); - buf2 = _mm_movelh_ps( buf2, buf3 ); - buf3 = _mm_shuffle_ps( buf3, buf4, _MM_SHUFFLE(1,0,3,2) ); // hi(buf3), lo(buf4) - } - { - float *pr_pt = pr[mu]+2*6*(*nb_pt); - _mm_storeu_ps( pr_pt, buf1 ); - _mm_storeu_ps( pr_pt+4, buf2 ); - _mm_storeu_ps( pr_pt+8, buf3 ); - } - } - nb_pt++; - D_pt += 24; - } - - phi_pt += 24; - } -} - - -void dprn_su3_double( complex_double *prp[4], complex_double *phi, operator_double_struct *op, int *neighbor, int start, int end ) { - - double *phi_pt = (double*)(phi+start); - double *phi_end_pt = (double*)(phi+end); - double *pr[4] = {(double*)(prp[0]),(double*)(prp[1]),(double*)(prp[2]),(double*)(prp[3])}; - double *D_pt = ((double*)(op->D))+2*(start/24*36); - int *nb_pt = neighbor+((start/24)*4); - - while ( phi_pt < phi_end_pt ) { - - __m128d in_re[6]; - __m128d in_im[6]; - - for ( int i=0; i<3; i++ ) { - in_re[i] = _mm_setr_pd( phi_pt[2*i+0], phi_pt[2*i+6] ); - in_im[i] = _mm_setr_pd( phi_pt[2*i+1], phi_pt[2*i+7] ); - } - for ( int i=3; i<6; i++ ) { - in_re[i] = _mm_setr_pd( phi_pt[2*i+6], phi_pt[2*i+12] ); - in_im[i] = _mm_setr_pd( phi_pt[2*i+7], phi_pt[2*i+13] ); - } - - for ( int mu=0; mu<4; mu++ ) { - - __m128d v_re[6]; - __m128d v_im[6]; - - // calc spin projection - for ( int i=0; i<3; i++ ) { - v_re[i] = _mm_setr_pd( index_d_re(phi_pt+2*i,mu,0), index_d_re(phi_pt+2*i,mu,1) ); - v_im[i] = _mm_setr_pd( index_d_im(phi_pt+2*i,mu,0), index_d_im(phi_pt+2*i,mu,1) ); - v_re[i] = _mm_add_pd( in_re[i], v_re[i] ); - v_im[i] = _mm_add_pd( in_im[i], v_im[i] ); - } - for ( int i=3; i<6; i++ ) { - v_re[i] = _mm_setr_pd( index_d_re(phi_pt+6+2*i,mu,0), index_d_re(phi_pt+6+2*i,mu,1) ); - v_im[i] = _mm_setr_pd( index_d_im(phi_pt+6+2*i,mu,0), index_d_im(phi_pt+6+2*i,mu,1) ); - v_re[i] = _mm_add_pd( in_re[i], v_re[i] ); - v_im[i] = _mm_add_pd( in_im[i], v_im[i] ); - } - - { - __m128d res_re[6]; - __m128d res_im[6]; - // load su(3) matrix and multiply - for ( int i=0; i<3; i++ ) { - __m128d buf_re = _mm_set1_pd( D_pt[0+2*i] ); - __m128d buf_im = _mm_set1_pd( D_pt[1+2*i] ); - cmul_conj_pd( buf_re, buf_im, v_re[0], v_im[0], &res_re[i], &res_im[i] ); - cmul_conj_pd( buf_re, buf_im, v_re[3], v_im[3], &res_re[i+3], &res_im[i+3] ); - buf_re = _mm_set1_pd( D_pt[6+2*i] ); - buf_im = _mm_set1_pd( D_pt[7+2*i] ); - cfmadd_conj_pd( buf_re, buf_im, v_re[1], 
v_im[1], &res_re[i], &res_im[i] ); - cfmadd_conj_pd( buf_re, buf_im, v_re[4], v_im[4], &res_re[i+3], &res_im[i+3] ); - buf_re = _mm_set1_pd( D_pt[12+2*i] ); - buf_im = _mm_set1_pd( D_pt[13+2*i] ); - cfmadd_conj_pd( buf_re, buf_im, v_re[2], v_im[2], &res_re[i], &res_im[i] ); - cfmadd_conj_pd( buf_re, buf_im, v_re[5], v_im[5], &res_re[i+3], &res_im[i+3] ); - } - - { - double *pr_pt = pr[mu]+2*12*(*(nb_pt)); - for ( int i=0; i<3; i++ ) { - __m128d out1 = _mm_unpacklo_pd( res_re[i], res_im[i] ); - __m128d out2 = _mm_unpackhi_pd( res_re[i], res_im[i] ); - _mm_storeu_pd( pr_pt+0+2*i, out1 ); - _mm_storeu_pd( pr_pt+6+2*i, out2 ); - } - for ( int i=3; i<6; i++ ) { - __m128d out1 = _mm_unpacklo_pd( res_re[i], res_im[i] ); - __m128d out2 = _mm_unpackhi_pd( res_re[i], res_im[i] ); - _mm_storeu_pd( pr_pt+ 6+2*i, out1 ); - _mm_storeu_pd( pr_pt+12+2*i, out2 ); - } - } - } - - D_pt += 18; - nb_pt++; - } - - phi_pt += 24*2; - } - -} - - -void dprn_su3_float( complex_float *prp[4], complex_float *phi, operator_float_struct *op, int *neighbor, int start, int end ) { - - float *phi_pt = (float*)(phi+start); - float *phi_end_pt = (float*)(phi+end); - float *pr[4] = {(float*)(prp[0]),(float*)(prp[1]),(float*)(prp[2]),(float*)(prp[3])}; - float *D_pt = (float*)(op->D_transformed_vectorized+2*(start/24*48)); - int *nb_pt = neighbor+((start/24)*4); - - while ( phi_pt < phi_end_pt ) { - - __m128 in11[2]; - __m128 in21[2]; - __m128 in12[2]; - __m128 in22[2]; - - in11[0] = _mm_setr_ps( phi_pt[0], phi_pt[2], phi_pt[4], 0 ); - in11[1] = _mm_setr_ps( phi_pt[1], phi_pt[3], phi_pt[5], 0 ); - in21[0] = _mm_setr_ps( phi_pt[6], phi_pt[8], phi_pt[10], 0 ); - in21[1] = _mm_setr_ps( phi_pt[7], phi_pt[9], phi_pt[11], 0 ); - in12[0] = _mm_setr_ps( phi_pt[12], phi_pt[14], phi_pt[16], 0 ); - in12[1] = _mm_setr_ps( phi_pt[13], phi_pt[15], phi_pt[17], 0 ); - in22[0] = _mm_setr_ps( phi_pt[18], phi_pt[20], phi_pt[22], 0 ); - in22[1] = _mm_setr_ps( phi_pt[19], phi_pt[21], phi_pt[23], 0 ); - - for ( int mu=0; mu<4; mu++ ) { - __m128 res11[2]; - __m128 res21[2]; - __m128 res12[2]; - __m128 res22[2]; - - { - // calc spin0 projection - res11[0] = _mm_setr_ps( index_d_re(phi_pt,mu,0), index_d_re(phi_pt+2,mu,0), index_d_re(phi_pt+4,mu,0), 0 ); - res11[1] = _mm_setr_ps( index_d_im(phi_pt,mu,0), index_d_im(phi_pt+2,mu,0), index_d_im(phi_pt+4,mu,0), 0 ); - __m128 in11_re = _mm_add_ps( in11[0], res11[0] ); - __m128 in11_im = _mm_add_ps( in11[1], res11[1] ); - - // calc spin1 projection - res11[0] = _mm_setr_ps( index_d_re(phi_pt,mu,1), index_d_re(phi_pt+2,mu,1), index_d_re(phi_pt+4,mu,1), 0 ); - res11[1] = _mm_setr_ps( index_d_im(phi_pt,mu,1), index_d_im(phi_pt+2,mu,1), index_d_im(phi_pt+4,mu,1), 0 ); - __m128 in21_re = _mm_add_ps( in21[0], res11[0] ); - __m128 in21_im = _mm_add_ps( in21[1], res11[1] ); - - // calc spin0 projection - res12[0] = _mm_setr_ps( index_d_re(phi_pt+12,mu,0), index_d_re(phi_pt+14,mu,0), index_d_re(phi_pt+16,mu,0), 0 ); - res12[1] = _mm_setr_ps( index_d_im(phi_pt+12,mu,0), index_d_im(phi_pt+14,mu,0), index_d_im(phi_pt+16,mu,0), 0 ); - __m128 in12_re = _mm_add_ps( in12[0], res12[0] ); - __m128 in12_im = _mm_add_ps( in12[1], res12[1] ); - - // calc spin1 projection - res12[0] = _mm_setr_ps( index_d_re(phi_pt+12,mu,1), index_d_re(phi_pt+14,mu,1), index_d_re(phi_pt+16,mu,1), 0 ); - res12[1] = _mm_setr_ps( index_d_im(phi_pt+12,mu,1), index_d_im(phi_pt+14,mu,1), index_d_im(phi_pt+16,mu,1), 0 ); - __m128 in22_re = _mm_add_ps( in22[0], res12[0] ); - __m128 in22_im = _mm_add_ps( in22[1], res12[1] ); - - // load 1st part of su(3) 
matrix and multiply - { - __m128 buf1 = _mm_loadu_ps( D_pt ); - __m128 buf2 = _mm_loadu_ps( D_pt+SIMD_LENGTH_float ); - { - __m128 buf3 = _mm_shuffle_ps( in11_re, in11_re, _MM_SHUFFLE(0,0,0,0) ); - __m128 buf4 = _mm_shuffle_ps( in11_im, in11_im, _MM_SHUFFLE(0,0,0,0) ); - cmul_conj( buf1, buf2, buf3, buf4, &res11[0], &res11[1] ); - } - { - __m128 buf3 = _mm_shuffle_ps( in21_re, in21_re, _MM_SHUFFLE(0,0,0,0) ); - __m128 buf4 = _mm_shuffle_ps( in21_im, in21_im, _MM_SHUFFLE(0,0,0,0) ); - cmul_conj( buf1, buf2, buf3, buf4, &res21[0], &res21[1] ); - } - { - __m128 buf3 = _mm_shuffle_ps( in12_re, in12_re, _MM_SHUFFLE(0,0,0,0) ); - __m128 buf4 = _mm_shuffle_ps( in12_im, in12_im, _MM_SHUFFLE(0,0,0,0) ); - cmul_conj( buf1, buf2, buf3, buf4, &res12[0], &res12[1] ); - } - { - __m128 buf3 = _mm_shuffle_ps( in22_re, in22_re, _MM_SHUFFLE(0,0,0,0) ); - __m128 buf4 = _mm_shuffle_ps( in22_im, in22_im, _MM_SHUFFLE(0,0,0,0) ); - cmul_conj( buf1, buf2, buf3, buf4, &res22[0], &res22[1] ); - } - } - // load 2nd part of su(3) matrix and multiply - { - __m128 buf1 = _mm_loadu_ps( D_pt+2*SIMD_LENGTH_float ); - __m128 buf2 = _mm_loadu_ps( D_pt+3*SIMD_LENGTH_float ); - { - __m128 buf3 = _mm_shuffle_ps( in11_re, in11_re, _MM_SHUFFLE(1,1,1,1) ); - __m128 buf4 = _mm_shuffle_ps( in11_im, in11_im, _MM_SHUFFLE(1,1,1,1) ); - cfmadd_conj( buf1, buf2, buf3, buf4, &res11[0], &res11[1] ); - } - { - __m128 buf3 = _mm_shuffle_ps( in21_re, in21_re, _MM_SHUFFLE(1,1,1,1) ); - __m128 buf4 = _mm_shuffle_ps( in21_im, in21_im, _MM_SHUFFLE(1,1,1,1) ); - cfmadd_conj( buf1, buf2, buf3, buf4, &res21[0], &res21[1] ); - } - { - __m128 buf3 = _mm_shuffle_ps( in12_re, in12_re, _MM_SHUFFLE(1,1,1,1) ); - __m128 buf4 = _mm_shuffle_ps( in12_im, in12_im, _MM_SHUFFLE(1,1,1,1) ); - cfmadd_conj( buf1, buf2, buf3, buf4, &res12[0], &res12[1] ); - } - { - __m128 buf3 = _mm_shuffle_ps( in22_re, in22_re, _MM_SHUFFLE(1,1,1,1) ); - __m128 buf4 = _mm_shuffle_ps( in22_im, in22_im, _MM_SHUFFLE(1,1,1,1) ); - cfmadd_conj( buf1, buf2, buf3, buf4, &res22[0], &res22[1] ); - } - } - // load 3rd part of su(3) matrix and multiply - { - __m128 buf1 = _mm_loadu_ps( D_pt+4*SIMD_LENGTH_float ); - __m128 buf2 = _mm_loadu_ps( D_pt+5*SIMD_LENGTH_float ); - { - __m128 buf3 = _mm_shuffle_ps( in11_re, in11_re, _MM_SHUFFLE(2,2,2,2) ); - __m128 buf4 = _mm_shuffle_ps( in11_im, in11_im, _MM_SHUFFLE(2,2,2,2) ); - cfmadd_conj( buf1, buf2, buf3, buf4, &res11[0], &res11[1] ); - } - { - __m128 buf3 = _mm_shuffle_ps( in21_re, in21_re, _MM_SHUFFLE(2,2,2,2) ); - __m128 buf4 = _mm_shuffle_ps( in21_im, in21_im, _MM_SHUFFLE(2,2,2,2) ); - cfmadd_conj( buf1, buf2, buf3, buf4, &res21[0], &res21[1] ); - } - { - __m128 buf3 = _mm_shuffle_ps( in12_re, in12_re, _MM_SHUFFLE(2,2,2,2) ); - __m128 buf4 = _mm_shuffle_ps( in12_im, in12_im, _MM_SHUFFLE(2,2,2,2) ); - cfmadd_conj( buf1, buf2, buf3, buf4, &res12[0], &res12[1] ); - } - { - __m128 buf3 = _mm_shuffle_ps( in22_re, in22_re, _MM_SHUFFLE(2,2,2,2) ); - __m128 buf4 = _mm_shuffle_ps( in22_im, in22_im, _MM_SHUFFLE(2,2,2,2) ); - cfmadd_conj( buf1, buf2, buf3, buf4, &res22[0], &res22[1] ); - } - } - } - - float *pr_pt = pr[mu]+2*12*(*nb_pt); - { - __m128 buf1 = _mm_unpacklo_ps( res11[0], res11[1] ); - __m128 buf2 = _mm_unpackhi_ps( res11[0], res11[1] ); - __m128 buf3 = _mm_unpacklo_ps( res21[0], res21[1] ); - - { - __m128 buf4 = _mm_unpackhi_ps( res21[0], res21[1] ); - buf2 = _mm_movelh_ps( buf2, buf3 ); - buf3 = _mm_shuffle_ps( buf3, buf4, _MM_SHUFFLE(1,0,3,2) ); // hi(buf3), lo(buf4) - } - { - _mm_storeu_ps( pr_pt, buf1 ); - _mm_storeu_ps( pr_pt+4, buf2 
); - _mm_storeu_ps( pr_pt+8, buf3 ); - } - } - { - __m128 buf1 = _mm_unpacklo_ps( res12[0], res12[1] ); - __m128 buf2 = _mm_unpackhi_ps( res12[0], res12[1] ); - __m128 buf3 = _mm_unpacklo_ps( res22[0], res22[1] ); - - { - __m128 buf4 = _mm_unpackhi_ps( res22[0], res22[1] ); - buf2 = _mm_movelh_ps( buf2, buf3 ); - buf3 = _mm_shuffle_ps( buf3, buf4, _MM_SHUFFLE(1,0,3,2) ); // hi(buf3), lo(buf4) - } - { - _mm_storeu_ps( pr_pt+12, buf1 ); - _mm_storeu_ps( pr_pt+16, buf2 ); - _mm_storeu_ps( pr_pt+20, buf3 ); - } - } - nb_pt++; - D_pt += 24; - } - - phi_pt += 48; - } -} - - -void pbn_double( complex_double *eta, complex_double *prp[4], int start, int end ) { - - double *pr[4] = {(double*)(prp[0]),(double*)(prp[1]),(double*)(prp[2]),(double*)(prp[3])}; - double *eta_pt = (double*)(eta+start); - - __m128d gamma0[4]; - __m128d gamma1[4]; - - for ( int mu=0; mu<4; mu++ ) { - gamma0[mu] = _mm_setr_pd( gamma_re_sign[mu][gamma_co[mu][0]], gamma_im_sign[mu][gamma_co[mu][0]] ); - gamma1[mu] = _mm_setr_pd( gamma_re_sign[mu][gamma_co[mu][1]], gamma_im_sign[mu][gamma_co[mu][1]] ); - } - - for ( int i=start; iD))+2*(start*3); - double *eta_pt = (double*)(eta+start); - double *eta_end_pt = (double*)(eta+end); - double *pr[4] = {(double*)(prn[0]),(double*)(prn[1]),(double*)(prn[2]),(double*)(prn[3])}; - int *nb_pt = neighbor+((start/12)*4); - - __m128d gamma0[4]; - __m128d gamma1[4]; - - for ( int mu=0; mu<4; mu++ ) { - gamma0[mu] = _mm_setr_pd( -gamma_re_sign[mu][gamma_co[mu][0]], -gamma_im_sign[mu][gamma_co[mu][0]] ); - gamma1[mu] = _mm_setr_pd( -gamma_re_sign[mu][gamma_co[mu][1]], -gamma_im_sign[mu][gamma_co[mu][1]] ); - } - - while( eta_pt < eta_end_pt ) { - - __m128d res[12]; - for ( int i=0; i<12; i++ ) { - res[i] = _mm_loadu_pd( eta_pt + 2*i ); - } - - // --------------- - // mu = T - { - __m128d res_re[3]; - __m128d res_im[3]; - { - __m128d v_re[3]; - __m128d v_im[3]; - int j = 2*6*(*nb_pt); - - for ( int i=0; i<3; i++ ) { - v_re[i] = _mm_setr_pd( *(pr[T]+j+0+2*i), *(pr[T]+j+6+2*i) ); - v_im[i] = _mm_setr_pd( *(pr[T]+j+1+2*i), *(pr[T]+j+7+2*i) ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf_re = _mm_set1_pd( D_pt[0+6*i] ); - __m128d buf_im = _mm_set1_pd( D_pt[1+6*i] ); - cmul_pd( buf_re, buf_im, v_re[0], v_im[0], &res_re[i], &res_im[i] ); - buf_re = _mm_set1_pd( D_pt[2+6*i] ); - buf_im = _mm_set1_pd( D_pt[3+6*i] ); - cfmadd_pd( buf_re, buf_im, v_re[1], v_im[1], &res_re[i], &res_im[i] ); - buf_re = _mm_set1_pd( D_pt[4+6*i] ); - buf_im = _mm_set1_pd( D_pt[5+6*i] ); - cfmadd_pd( buf_re, buf_im, v_re[2], v_im[2], &res_re[i], &res_im[i] ); - } - D_pt += 18; - nb_pt++; - } - { - __m128d in[6]; - in[0] = _mm_unpacklo_pd( res_re[0], res_im[0] ); - in[1] = _mm_unpacklo_pd( res_re[1], res_im[1] ); - in[2] = _mm_unpacklo_pd( res_re[2], res_im[2] ); - - in[3] = _mm_unpackhi_pd( res_re[0], res_im[0] ); - in[4] = _mm_unpackhi_pd( res_re[1], res_im[1] ); - in[5] = _mm_unpackhi_pd( res_re[2], res_im[2] ); - - for ( int i=0; i<6; i++ ) { - res[i] = _mm_sub_pd( res[i], in[i] ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf1 = _mm_mul_pd( gamma0[T], GAMMA_T_SHUFFLE(in[i]) ); - res[3*gamma_co[T][0]+i] = _mm_sub_pd( res[3*gamma_co[T][0]+i], buf1 ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf1 = _mm_mul_pd( gamma1[T], GAMMA_T_SHUFFLE(in[3+i]) ); - res[3*gamma_co[T][1]+i] = _mm_sub_pd( res[3*gamma_co[T][1]+i], buf1 ); - } - } - } - // --------------- - // mu = Z - { - __m128d res_re[3]; - __m128d res_im[3]; - { - __m128d v_re[3]; - __m128d v_im[3]; - int j = 2*6*(*nb_pt); - - for ( int i=0; i<3; i++ ) { - v_re[i] = 
_mm_setr_pd( *(pr[Z]+j+0+2*i), *(pr[Z]+j+6+2*i) ); - v_im[i] = _mm_setr_pd( *(pr[Z]+j+1+2*i), *(pr[Z]+j+7+2*i) ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf_re = _mm_set1_pd( D_pt[0+6*i] ); - __m128d buf_im = _mm_set1_pd( D_pt[1+6*i] ); - cmul_pd( buf_re, buf_im, v_re[0], v_im[0], &res_re[i], &res_im[i] ); - buf_re = _mm_set1_pd( D_pt[2+6*i] ); - buf_im = _mm_set1_pd( D_pt[3+6*i] ); - cfmadd_pd( buf_re, buf_im, v_re[1], v_im[1], &res_re[i], &res_im[i] ); - buf_re = _mm_set1_pd( D_pt[4+6*i] ); - buf_im = _mm_set1_pd( D_pt[5+6*i] ); - cfmadd_pd( buf_re, buf_im, v_re[2], v_im[2], &res_re[i], &res_im[i] ); - } - D_pt += 18; - nb_pt++; - } - { - __m128d in[6]; - in[0] = _mm_unpacklo_pd( res_re[0], res_im[0] ); - in[1] = _mm_unpacklo_pd( res_re[1], res_im[1] ); - in[2] = _mm_unpacklo_pd( res_re[2], res_im[2] ); - - in[3] = _mm_unpackhi_pd( res_re[0], res_im[0] ); - in[4] = _mm_unpackhi_pd( res_re[1], res_im[1] ); - in[5] = _mm_unpackhi_pd( res_re[2], res_im[2] ); - - for ( int i=0; i<6; i++ ) { - res[i] = _mm_sub_pd( res[i], in[i] ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf1 = _mm_mul_pd( gamma0[Z], GAMMA_Z_SHUFFLE(in[i]) ); - res[3*gamma_co[Z][0]+i] = _mm_sub_pd( res[3*gamma_co[Z][0]+i], buf1 ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf1 = _mm_mul_pd( gamma1[Z], GAMMA_Z_SHUFFLE(in[3+i]) ); - res[3*gamma_co[Z][1]+i] = _mm_sub_pd( res[3*gamma_co[Z][1]+i], buf1 ); - } - } - } - // --------------- - // mu = Y - { - __m128d res_re[3]; - __m128d res_im[3]; - { - __m128d v_re[3]; - __m128d v_im[3]; - int j = 2*6*(*nb_pt); - - for ( int i=0; i<3; i++ ) { - v_re[i] = _mm_setr_pd( *(pr[Y]+j+0+2*i), *(pr[Y]+j+6+2*i) ); - v_im[i] = _mm_setr_pd( *(pr[Y]+j+1+2*i), *(pr[Y]+j+7+2*i) ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf_re = _mm_set1_pd( D_pt[0+6*i] ); - __m128d buf_im = _mm_set1_pd( D_pt[1+6*i] ); - cmul_pd( buf_re, buf_im, v_re[0], v_im[0], &res_re[i], &res_im[i] ); - buf_re = _mm_set1_pd( D_pt[2+6*i] ); - buf_im = _mm_set1_pd( D_pt[3+6*i] ); - cfmadd_pd( buf_re, buf_im, v_re[1], v_im[1], &res_re[i], &res_im[i] ); - buf_re = _mm_set1_pd( D_pt[4+6*i] ); - buf_im = _mm_set1_pd( D_pt[5+6*i] ); - cfmadd_pd( buf_re, buf_im, v_re[2], v_im[2], &res_re[i], &res_im[i] ); - } - D_pt += 18; - nb_pt++; - } - { - __m128d in[6]; - in[0] = _mm_unpacklo_pd( res_re[0], res_im[0] ); - in[1] = _mm_unpacklo_pd( res_re[1], res_im[1] ); - in[2] = _mm_unpacklo_pd( res_re[2], res_im[2] ); - - in[3] = _mm_unpackhi_pd( res_re[0], res_im[0] ); - in[4] = _mm_unpackhi_pd( res_re[1], res_im[1] ); - in[5] = _mm_unpackhi_pd( res_re[2], res_im[2] ); - - for ( int i=0; i<6; i++ ) { - res[i] = _mm_sub_pd( res[i], in[i] ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf1 = _mm_mul_pd( gamma0[Y], GAMMA_Y_SHUFFLE(in[i]) ); - res[3*gamma_co[Y][0]+i] = _mm_sub_pd( res[3*gamma_co[Y][0]+i], buf1 ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf1 = _mm_mul_pd( gamma1[Y], GAMMA_Y_SHUFFLE(in[3+i]) ); - res[3*gamma_co[Y][1]+i] = _mm_sub_pd( res[3*gamma_co[Y][1]+i], buf1 ); - } - } - } - // --------------- - // mu = X - { - __m128d res_re[3]; - __m128d res_im[3]; - { - __m128d v_re[3]; - __m128d v_im[3]; - int j = 2*6*(*nb_pt); - - for ( int i=0; i<3; i++ ) { - v_re[i] = _mm_setr_pd( *(pr[X]+j+0+2*i), *(pr[X]+j+6+2*i) ); - v_im[i] = _mm_setr_pd( *(pr[X]+j+1+2*i), *(pr[X]+j+7+2*i) ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf_re = _mm_set1_pd( D_pt[0+6*i] ); - __m128d buf_im = _mm_set1_pd( D_pt[1+6*i] ); - cmul_pd( buf_re, buf_im, v_re[0], v_im[0], &res_re[i], &res_im[i] ); - buf_re = _mm_set1_pd( D_pt[2+6*i] ); - 
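In scalar form, each cmul_pd/cfmadd_pd triple in these per-direction blocks is one row of a complex 3x3 colour matrix-vector product; the strides D_pt[0+6*i], D_pt[2+6*i], D_pt[4+6*i] correspond to row-major storage with interleaved real/imaginary doubles. A scalar sketch of the same product (illustrative helper, not part of the code being removed):

    #include <complex.h>
    /* illustrative scalar version of the unrolled SU(3) multiply: res = D * v,
       with D stored row-major as a 3x3 array of complex numbers. */
    static void su3_mvm_scalar( double complex res[3], const double complex D[9], const double complex v[3] ) {
      for ( int i=0; i<3; i++ )
        res[i] = D[3*i+0]*v[0] + D[3*i+1]*v[1] + D[3*i+2]*v[2];
    }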
buf_im = _mm_set1_pd( D_pt[3+6*i] ); - cfmadd_pd( buf_re, buf_im, v_re[1], v_im[1], &res_re[i], &res_im[i] ); - buf_re = _mm_set1_pd( D_pt[4+6*i] ); - buf_im = _mm_set1_pd( D_pt[5+6*i] ); - cfmadd_pd( buf_re, buf_im, v_re[2], v_im[2], &res_re[i], &res_im[i] ); - } - D_pt += 18; - nb_pt++; - } - { - __m128d in[6]; - in[0] = _mm_unpacklo_pd( res_re[0], res_im[0] ); - in[1] = _mm_unpacklo_pd( res_re[1], res_im[1] ); - in[2] = _mm_unpacklo_pd( res_re[2], res_im[2] ); - - in[3] = _mm_unpackhi_pd( res_re[0], res_im[0] ); - in[4] = _mm_unpackhi_pd( res_re[1], res_im[1] ); - in[5] = _mm_unpackhi_pd( res_re[2], res_im[2] ); - - for ( int i=0; i<6; i++ ) { - res[i] = _mm_sub_pd( res[i], in[i] ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf1 = _mm_mul_pd( gamma0[X], GAMMA_X_SHUFFLE(in[i]) ); - res[3*gamma_co[X][0]+i] = _mm_sub_pd( res[3*gamma_co[X][0]+i], buf1 ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf1 = _mm_mul_pd( gamma1[X], GAMMA_X_SHUFFLE(in[3+i]) ); - res[3*gamma_co[X][1]+i] = _mm_sub_pd( res[3*gamma_co[X][1]+i], buf1 ); - } - } - } - // --------------- - - for ( int i=0; i<12; i++ ) { - _mm_storeu_pd( eta_pt + 2*i, res[i] ); - } - eta_pt+=24; - } - -} - - -void su3_pbp_float( complex_float* eta, complex_float *prn[4], operator_float_struct *op, - int *neighbor, int start, int end ) { - - float *D_pt = (float*)(op->D_vectorized+2*(start*4)); - float *eta_pt = (float*)(eta+start); - float *eta_end_pt = (float*)(eta+end); - float *pr[4] = {(float*)(prn[0]),(float*)(prn[1]),(float*)(prn[2]),(float*)(prn[3])}; - int *nb_pt = neighbor+((start/12)*4); - - __m128 gamma0[4][2]; - __m128 gamma1[4][2]; - - for ( int mu=0; mu<4; mu++ ) { - gamma0[mu][0] = _mm_set1_ps( -gamma_re_sign[mu][gamma_co[mu][0]] ); - gamma0[mu][1] = _mm_set1_ps( -gamma_im_sign[mu][gamma_co[mu][0]] ); - gamma1[mu][0] = _mm_set1_ps( -gamma_re_sign[mu][gamma_co[mu][1]] ); - gamma1[mu][1] = _mm_set1_ps( -gamma_im_sign[mu][gamma_co[mu][1]] ); - } - - while( eta_pt < eta_end_pt ) { - - __m128 eta_lo1 = _mm_loadu_ps( eta_pt ); - __m128 eta_lo2 = _mm_loadu_ps( eta_pt + 4 ); - __m128 eta_hi1 = _mm_loadu_ps( eta_pt + 6 ); - __m128 eta_hi2 = _mm_loadu_ps( eta_pt + 10 ); - - __m128 eta2_lo[2]; - __m128 eta2_hi[2]; - - eta2_lo[0] = _mm_loadu_ps( eta_pt + 12 ); - eta2_hi[0] = _mm_loadu_ps( eta_pt + 14 ); - eta2_lo[1] = _mm_loadu_ps( eta_pt + 18 ); - eta2_hi[1] = _mm_loadu_ps( eta_pt + 20 ); - - for ( int mu=0; mu<4; mu++ ) { - __m128 res1[2]; - __m128 res2[2]; - - { - int j = 2*6*(*nb_pt); - // load 1st part of su(3) matrix and multiply - { - __m128 buf1 = _mm_loadu_ps( D_pt ); - __m128 buf2 = _mm_loadu_ps( D_pt+SIMD_LENGTH_float ); - { - __m128 buf3 = _mm_set1_ps( *(pr[mu]+j+0) ); - __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+1) ); - cmul( buf1, buf2, buf3, buf4, &res1[0], &res1[1] ); - } - { - __m128 buf3 = _mm_set1_ps( *(pr[mu]+j+6) ); - __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+7) ); - cmul( buf1, buf2, buf3, buf4, &res2[0], &res2[1] ); - } - } - // load 2nd part of su(3) matrix and multiply - { - __m128 buf1 = _mm_loadu_ps( D_pt+2*SIMD_LENGTH_float ); - __m128 buf2 = _mm_loadu_ps( D_pt+3*SIMD_LENGTH_float ); - { - __m128 buf3 = _mm_set1_ps( *(pr[mu]+j+2) ); - __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+3) ); - cfmadd( buf1, buf2, buf3, buf4, &res1[0], &res1[1] ); - } - { - __m128 buf3 = _mm_set1_ps( *(pr[mu]+j+8) ); - __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+9) ); - cfmadd( buf1, buf2, buf3, buf4, &res2[0], &res2[1] ); - } - } - // load 3rd part of su(3) matrix and multiply - { - __m128 buf1 = _mm_loadu_ps( D_pt+4*SIMD_LENGTH_float ); - __m128 
buf2 = _mm_loadu_ps( D_pt+5*SIMD_LENGTH_float ); - { - __m128 buf3 = _mm_set1_ps( *(pr[mu]+j+4) ); - __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+5) ); - cfmadd( buf1, buf2, buf3, buf4, &res1[0], &res1[1] ); - } - { - __m128 buf3 = _mm_set1_ps( *(pr[mu]+j+10) ); - __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+11) ); - cfmadd( buf1, buf2, buf3, buf4, &res2[0], &res2[1] ); - } - } - } - - { - // store spin0 contribution - { - __m128 buf1 = _mm_unpacklo_ps( res1[0], res1[1] ); - __m128 buf2 = _mm_unpackhi_ps( res1[0], res1[1] ); - eta_lo1 = _mm_sub_ps( eta_lo1, buf1 ); - eta_lo2 = _mm_sub_ps( eta_lo2, buf2 ); - } - - // store contribution from 1st SU(3) multiplication to either spin2 or spin3 - __m128 buf1 = _mm_mul_ps( gamma0[mu][0], res1[gamma_offset[mu][0]] ); - __m128 buf2 = _mm_mul_ps( gamma0[mu][1], res1[1-gamma_offset[mu][0]] ); - __m128 buf3 = _mm_unpacklo_ps( buf1, buf2 ); - __m128 buf4 = _mm_unpackhi_ps( buf1, buf2 ); - buf4 = _mm_shuffle_ps( buf3, buf4, _MM_SHUFFLE(1,0,3,2) ); // hi(buf3), lo(buf4) - eta2_lo[gamma_co[mu][2]] = _mm_sub_ps( eta2_lo[gamma_co[mu][2]], buf3 ); - eta2_hi[gamma_co[mu][2]] = _mm_sub_ps( eta2_hi[gamma_co[mu][2]], buf4 ); - } - - { - // store spin1 contribution - { - __m128 buf1 = _mm_unpacklo_ps( res2[0], res2[1] ); - __m128 buf2 = _mm_unpackhi_ps( res2[0], res2[1] ); - eta_hi1 = _mm_sub_ps( eta_hi1, buf1 ); - eta_hi2 = _mm_sub_ps( eta_hi2, buf2 ); - } - - // store contribution from 1st SU(3) multiplication to either spin2 or spin3 - __m128 buf1 = _mm_mul_ps( gamma1[mu][0], res2[gamma_offset[mu][1]] ); - __m128 buf2 = _mm_mul_ps( gamma1[mu][1], res2[1-gamma_offset[mu][1]] ); - __m128 buf3 = _mm_unpacklo_ps( buf1, buf2 ); - __m128 buf4 = _mm_unpackhi_ps( buf1, buf2 ); - buf4 = _mm_shuffle_ps( buf3, buf4, _MM_SHUFFLE(1,0,3,2) ); // hi(buf3), lo(buf4) - eta2_lo[gamma_co[mu][3]] = _mm_sub_ps( eta2_lo[gamma_co[mu][3]], buf3 ); - eta2_hi[gamma_co[mu][3]] = _mm_sub_ps( eta2_hi[gamma_co[mu][3]], buf4 ); - } - - nb_pt++; - D_pt += 24; - } - - _mm_storeu_ps( eta_pt, eta_lo1 ); - _mm_storeu_ps( eta_pt+4, eta_lo2 ); - _mm_storeu_ps( eta_pt+6, eta_hi1 ); - _mm_storeu_ps( eta_pt+10, eta_hi2 ); - _mm_storeu_ps( eta_pt+12, eta2_lo[0] ); - _mm_storeu_ps( eta_pt+14, eta2_hi[0] ); - _mm_storeu_ps( eta_pt+18, eta2_lo[1] ); - _mm_storeu_ps( eta_pt+20, eta2_hi[1] ); - - eta_pt += 24; - } - -} - - -void su3_dpbp_double( complex_double* eta, complex_double *prn[4], operator_double_struct *op, - int *neighbor, int start, int end ) { - - double *D_pt = ((double*)(op->D))+2*(start/24*36); - double *eta_pt = (double*)(eta+start); - double *eta_end_pt = (double*)(eta+end); - double *pr[4] = {(double*)(prn[0]),(double*)(prn[1]),(double*)(prn[2]),(double*)(prn[3])}; - int *nb_pt = neighbor+((start/24)*4); - - __m128d gamma0[4]; - __m128d gamma1[4]; - - for ( int mu=0; mu<4; mu++ ) { - gamma0[mu] = _mm_setr_pd( -gamma_re_sign[mu][gamma_co[mu][0]], -gamma_im_sign[mu][gamma_co[mu][0]] ); - gamma1[mu] = _mm_setr_pd( -gamma_re_sign[mu][gamma_co[mu][1]], -gamma_im_sign[mu][gamma_co[mu][1]] ); - } - - while( eta_pt < eta_end_pt ) { - - __m128d res[24]; - for ( int i=0; i<24; i++ ) { - res[i] = _mm_loadu_pd( eta_pt + 2*i ); - } - - // --------------- - // mu = T - { - __m128d res_re[6]; - __m128d res_im[6]; - { - __m128d v_re[6]; - __m128d v_im[6]; - int j = 2*12*(*nb_pt); - - for ( int i=0; i<3; i++ ) { - v_re[i] = _mm_setr_pd( *(pr[T]+j+0+2*i), *(pr[T]+j+6+2*i) ); - v_im[i] = _mm_setr_pd( *(pr[T]+j+1+2*i), *(pr[T]+j+7+2*i) ); - } - for ( int i=3; i<6; i++ ) { - v_re[i] = _mm_setr_pd( *(pr[T]+j+6+2*i), 
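The cmul / cfmadd (single precision) and cmul_pd / cfmadd_pd (double precision) helpers used throughout these kernels act on registers that hold real and imaginary parts separately; their definitions do not appear in this part of the diff. A minimal sketch of what such split-layout helpers compute, assuming that convention and using (a+ib)(c+id) = (ac-bd) + i(ad+bc):

#include <emmintrin.h>

/* sketch: r = a*b for complex operands kept as separate re/im registers */
static inline void cmul_pd_sketch( __m128d a_re, __m128d a_im, __m128d b_re, __m128d b_im,
                                   __m128d *r_re, __m128d *r_im ) {
  *r_re = _mm_sub_pd( _mm_mul_pd( a_re, b_re ), _mm_mul_pd( a_im, b_im ) );
  *r_im = _mm_add_pd( _mm_mul_pd( a_re, b_im ), _mm_mul_pd( a_im, b_re ) );
}

/* sketch: r += a*b, same layout */
static inline void cfmadd_pd_sketch( __m128d a_re, __m128d a_im, __m128d b_re, __m128d b_im,
                                     __m128d *r_re, __m128d *r_im ) {
  *r_re = _mm_add_pd( *r_re, _mm_sub_pd( _mm_mul_pd( a_re, b_re ), _mm_mul_pd( a_im, b_im ) ) );
  *r_im = _mm_add_pd( *r_im, _mm_add_pd( _mm_mul_pd( a_re, b_im ), _mm_mul_pd( a_im, b_re ) ) );
}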
*(pr[T]+j+12+2*i) ); - v_im[i] = _mm_setr_pd( *(pr[T]+j+7+2*i), *(pr[T]+j+13+2*i) ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf_re = _mm_set1_pd( D_pt[0+6*i] ); - __m128d buf_im = _mm_set1_pd( D_pt[1+6*i] ); - cmul_pd( buf_re, buf_im, v_re[0], v_im[0], &res_re[i], &res_im[i] ); - cmul_pd( buf_re, buf_im, v_re[3], v_im[3], &res_re[i+3], &res_im[i+3] ); - buf_re = _mm_set1_pd( D_pt[2+6*i] ); - buf_im = _mm_set1_pd( D_pt[3+6*i] ); - cfmadd_pd( buf_re, buf_im, v_re[1], v_im[1], &res_re[i], &res_im[i] ); - cfmadd_pd( buf_re, buf_im, v_re[4], v_im[4], &res_re[i+3], &res_im[i+3] ); - buf_re = _mm_set1_pd( D_pt[4+6*i] ); - buf_im = _mm_set1_pd( D_pt[5+6*i] ); - cfmadd_pd( buf_re, buf_im, v_re[2], v_im[2], &res_re[i], &res_im[i] ); - cfmadd_pd( buf_re, buf_im, v_re[5], v_im[5], &res_re[i+3], &res_im[i+3] ); - } - D_pt += 18; - nb_pt++; - } - { - __m128d in[12]; - in[0] = _mm_unpacklo_pd( res_re[0], res_im[0] ); - in[1] = _mm_unpacklo_pd( res_re[1], res_im[1] ); - in[2] = _mm_unpacklo_pd( res_re[2], res_im[2] ); - - in[3] = _mm_unpackhi_pd( res_re[0], res_im[0] ); - in[4] = _mm_unpackhi_pd( res_re[1], res_im[1] ); - in[5] = _mm_unpackhi_pd( res_re[2], res_im[2] ); - - in[6] = _mm_unpacklo_pd( res_re[3], res_im[3] ); - in[7] = _mm_unpacklo_pd( res_re[4], res_im[4] ); - in[8] = _mm_unpacklo_pd( res_re[5], res_im[5] ); - - in[9] = _mm_unpackhi_pd( res_re[3], res_im[3] ); - in[10] = _mm_unpackhi_pd( res_re[4], res_im[4] ); - in[11] = _mm_unpackhi_pd( res_re[5], res_im[5] ); - - for ( int i=0; i<12; i++ ) { - res[i] = _mm_sub_pd( res[i], in[i] ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf1 = _mm_mul_pd( gamma0[T], GAMMA_T_SHUFFLE(in[i]) ); - res[3*gamma_co[T][0]+6*(gamma_co[T][0]/2)+i] = _mm_sub_pd( res[3*gamma_co[T][0]+6*(gamma_co[T][0]/2)+i], buf1 ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf1 = _mm_mul_pd( gamma1[T], GAMMA_T_SHUFFLE(in[i+3]) ); - res[3*gamma_co[T][1]+6*(gamma_co[T][1]/2)+i] = _mm_sub_pd( res[3*gamma_co[T][1]+6*(gamma_co[T][1]/2)+i], buf1 ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf1 = _mm_mul_pd( gamma0[T], GAMMA_T_SHUFFLE(in[i+6]) ); - res[3*gamma_co[T][0]+6*(gamma_co[T][0]/2)+6+i] = _mm_sub_pd( res[3*gamma_co[T][0]+6*(gamma_co[T][0]/2)+6+i], buf1 ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf1 = _mm_mul_pd( gamma1[T], GAMMA_T_SHUFFLE(in[i+9]) ); - res[3*gamma_co[T][1]+6*(gamma_co[T][1]/2)+6+i] = _mm_sub_pd( res[3*gamma_co[T][1]+6*(gamma_co[T][1]/2)+6+i], buf1 ); - } - } - } - // --------------- - // mu = Z - { - __m128d res_re[6]; - __m128d res_im[6]; - { - __m128d v_re[6]; - __m128d v_im[6]; - int j = 2*12*(*nb_pt); - - for ( int i=0; i<3; i++ ) { - v_re[i] = _mm_setr_pd( *(pr[Z]+j+0+2*i), *(pr[Z]+j+6+2*i) ); - v_im[i] = _mm_setr_pd( *(pr[Z]+j+1+2*i), *(pr[Z]+j+7+2*i) ); - } - for ( int i=3; i<6; i++ ) { - v_re[i] = _mm_setr_pd( *(pr[Z]+j+6+2*i), *(pr[Z]+j+12+2*i) ); - v_im[i] = _mm_setr_pd( *(pr[Z]+j+7+2*i), *(pr[Z]+j+13+2*i) ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf_re = _mm_set1_pd( D_pt[0+6*i] ); - __m128d buf_im = _mm_set1_pd( D_pt[1+6*i] ); - cmul_pd( buf_re, buf_im, v_re[0], v_im[0], &res_re[i], &res_im[i] ); - cmul_pd( buf_re, buf_im, v_re[3], v_im[3], &res_re[i+3], &res_im[i+3] ); - buf_re = _mm_set1_pd( D_pt[2+6*i] ); - buf_im = _mm_set1_pd( D_pt[3+6*i] ); - cfmadd_pd( buf_re, buf_im, v_re[1], v_im[1], &res_re[i], &res_im[i] ); - cfmadd_pd( buf_re, buf_im, v_re[4], v_im[4], &res_re[i+3], &res_im[i+3] ); - buf_re = _mm_set1_pd( D_pt[4+6*i] ); - buf_im = _mm_set1_pd( D_pt[5+6*i] ); - cfmadd_pd( buf_re, buf_im, v_re[2], v_im[2], &res_re[i], 
&res_im[i] ); - cfmadd_pd( buf_re, buf_im, v_re[5], v_im[5], &res_re[i+3], &res_im[i+3] ); - } - D_pt += 18; - nb_pt++; - } - { - __m128d in[12]; - in[0] = _mm_unpacklo_pd( res_re[0], res_im[0] ); - in[1] = _mm_unpacklo_pd( res_re[1], res_im[1] ); - in[2] = _mm_unpacklo_pd( res_re[2], res_im[2] ); - - in[3] = _mm_unpackhi_pd( res_re[0], res_im[0] ); - in[4] = _mm_unpackhi_pd( res_re[1], res_im[1] ); - in[5] = _mm_unpackhi_pd( res_re[2], res_im[2] ); - - in[6] = _mm_unpacklo_pd( res_re[3], res_im[3] ); - in[7] = _mm_unpacklo_pd( res_re[4], res_im[4] ); - in[8] = _mm_unpacklo_pd( res_re[5], res_im[5] ); - - in[9] = _mm_unpackhi_pd( res_re[3], res_im[3] ); - in[10] = _mm_unpackhi_pd( res_re[4], res_im[4] ); - in[11] = _mm_unpackhi_pd( res_re[5], res_im[5] ); - - for ( int i=0; i<12; i++ ) { - res[i] = _mm_sub_pd( res[i], in[i] ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf1 = _mm_mul_pd( gamma0[Z], GAMMA_Z_SHUFFLE(in[i]) ); - res[3*gamma_co[Z][0]+6*(gamma_co[Z][0]/2)+i] = _mm_sub_pd( res[3*gamma_co[Z][0]+6*(gamma_co[Z][0]/2)+i], buf1 ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf1 = _mm_mul_pd( gamma1[Z], GAMMA_Z_SHUFFLE(in[i+3]) ); - res[3*gamma_co[Z][1]+6*(gamma_co[Z][1]/2)+i] = _mm_sub_pd( res[3*gamma_co[Z][1]+6*(gamma_co[Z][1]/2)+i], buf1 ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf1 = _mm_mul_pd( gamma0[Z], GAMMA_Z_SHUFFLE(in[i+6]) ); - res[3*gamma_co[Z][0]+6*(gamma_co[Z][0]/2)+6+i] = _mm_sub_pd( res[3*gamma_co[Z][0]+6*(gamma_co[Z][0]/2)+6+i], buf1 ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf1 = _mm_mul_pd( gamma1[Z], GAMMA_Z_SHUFFLE(in[i+9]) ); - res[3*gamma_co[Z][1]+6*(gamma_co[Z][1]/2)+6+i] = _mm_sub_pd( res[3*gamma_co[Z][1]+6*(gamma_co[Z][1]/2)+6+i], buf1 ); - } - } - } - // --------------- - // mu = Y - { - __m128d res_re[6]; - __m128d res_im[6]; - { - __m128d v_re[6]; - __m128d v_im[6]; - int j = 2*12*(*nb_pt); - - for ( int i=0; i<3; i++ ) { - v_re[i] = _mm_setr_pd( *(pr[Y]+j+0+2*i), *(pr[Y]+j+6+2*i) ); - v_im[i] = _mm_setr_pd( *(pr[Y]+j+1+2*i), *(pr[Y]+j+7+2*i) ); - } - for ( int i=3; i<6; i++ ) { - v_re[i] = _mm_setr_pd( *(pr[Y]+j+6+2*i), *(pr[Y]+j+12+2*i) ); - v_im[i] = _mm_setr_pd( *(pr[Y]+j+7+2*i), *(pr[Y]+j+13+2*i) ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf_re = _mm_set1_pd( D_pt[0+6*i] ); - __m128d buf_im = _mm_set1_pd( D_pt[1+6*i] ); - cmul_pd( buf_re, buf_im, v_re[0], v_im[0], &res_re[i], &res_im[i] ); - cmul_pd( buf_re, buf_im, v_re[3], v_im[3], &res_re[i+3], &res_im[i+3] ); - buf_re = _mm_set1_pd( D_pt[2+6*i] ); - buf_im = _mm_set1_pd( D_pt[3+6*i] ); - cfmadd_pd( buf_re, buf_im, v_re[1], v_im[1], &res_re[i], &res_im[i] ); - cfmadd_pd( buf_re, buf_im, v_re[4], v_im[4], &res_re[i+3], &res_im[i+3] ); - buf_re = _mm_set1_pd( D_pt[4+6*i] ); - buf_im = _mm_set1_pd( D_pt[5+6*i] ); - cfmadd_pd( buf_re, buf_im, v_re[2], v_im[2], &res_re[i], &res_im[i] ); - cfmadd_pd( buf_re, buf_im, v_re[5], v_im[5], &res_re[i+3], &res_im[i+3] ); - } - D_pt += 18; - nb_pt++; - } - { - __m128d in[12]; - in[0] = _mm_unpacklo_pd( res_re[0], res_im[0] ); - in[1] = _mm_unpacklo_pd( res_re[1], res_im[1] ); - in[2] = _mm_unpacklo_pd( res_re[2], res_im[2] ); - - in[3] = _mm_unpackhi_pd( res_re[0], res_im[0] ); - in[4] = _mm_unpackhi_pd( res_re[1], res_im[1] ); - in[5] = _mm_unpackhi_pd( res_re[2], res_im[2] ); - - in[6] = _mm_unpacklo_pd( res_re[3], res_im[3] ); - in[7] = _mm_unpacklo_pd( res_re[4], res_im[4] ); - in[8] = _mm_unpacklo_pd( res_re[5], res_im[5] ); - - in[9] = _mm_unpackhi_pd( res_re[3], res_im[3] ); - in[10] = _mm_unpackhi_pd( res_re[4], res_im[4] ); - in[11] = 
_mm_unpackhi_pd( res_re[5], res_im[5] ); - - for ( int i=0; i<12; i++ ) { - res[i] = _mm_sub_pd( res[i], in[i] ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf1 = _mm_mul_pd( gamma0[Y], GAMMA_Y_SHUFFLE(in[i]) ); - res[3*gamma_co[Y][0]+6*(gamma_co[Y][0]/2)+i] = _mm_sub_pd( res[3*gamma_co[Y][0]+6*(gamma_co[Y][0]/2)+i], buf1 ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf1 = _mm_mul_pd( gamma1[Y], GAMMA_Y_SHUFFLE(in[i+3]) ); - res[3*gamma_co[Y][1]+6*(gamma_co[Y][1]/2)+i] = _mm_sub_pd( res[3*gamma_co[Y][1]+6*(gamma_co[Y][1]/2)+i], buf1 ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf1 = _mm_mul_pd( gamma0[Y], GAMMA_Y_SHUFFLE(in[i+6]) ); - res[3*gamma_co[Y][0]+6*(gamma_co[Y][0]/2)+6+i] = _mm_sub_pd( res[3*gamma_co[Y][0]+6*(gamma_co[Y][0]/2)+6+i], buf1 ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf1 = _mm_mul_pd( gamma1[Y], GAMMA_Y_SHUFFLE(in[i+9]) ); - res[3*gamma_co[Y][1]+6*(gamma_co[Y][1]/2)+6+i] = _mm_sub_pd( res[3*gamma_co[Y][1]+6*(gamma_co[Y][1]/2)+6+i], buf1 ); - } - } - } - // --------------- - // mu = X - { - __m128d res_re[6]; - __m128d res_im[6]; - { - __m128d v_re[6]; - __m128d v_im[6]; - int j = 2*12*(*nb_pt); - - for ( int i=0; i<3; i++ ) { - v_re[i] = _mm_setr_pd( *(pr[X]+j+0+2*i), *(pr[X]+j+6+2*i) ); - v_im[i] = _mm_setr_pd( *(pr[X]+j+1+2*i), *(pr[X]+j+7+2*i) ); - } - for ( int i=3; i<6; i++ ) { - v_re[i] = _mm_setr_pd( *(pr[X]+j+6+2*i), *(pr[X]+j+12+2*i) ); - v_im[i] = _mm_setr_pd( *(pr[X]+j+7+2*i), *(pr[X]+j+13+2*i) ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf_re = _mm_set1_pd( D_pt[0+6*i] ); - __m128d buf_im = _mm_set1_pd( D_pt[1+6*i] ); - cmul_pd( buf_re, buf_im, v_re[0], v_im[0], &res_re[i], &res_im[i] ); - cmul_pd( buf_re, buf_im, v_re[3], v_im[3], &res_re[i+3], &res_im[i+3] ); - buf_re = _mm_set1_pd( D_pt[2+6*i] ); - buf_im = _mm_set1_pd( D_pt[3+6*i] ); - cfmadd_pd( buf_re, buf_im, v_re[1], v_im[1], &res_re[i], &res_im[i] ); - cfmadd_pd( buf_re, buf_im, v_re[4], v_im[4], &res_re[i+3], &res_im[i+3] ); - buf_re = _mm_set1_pd( D_pt[4+6*i] ); - buf_im = _mm_set1_pd( D_pt[5+6*i] ); - cfmadd_pd( buf_re, buf_im, v_re[2], v_im[2], &res_re[i], &res_im[i] ); - cfmadd_pd( buf_re, buf_im, v_re[5], v_im[5], &res_re[i+3], &res_im[i+3] ); - } - D_pt += 18; - nb_pt++; - } - { - __m128d in[12]; - in[0] = _mm_unpacklo_pd( res_re[0], res_im[0] ); - in[1] = _mm_unpacklo_pd( res_re[1], res_im[1] ); - in[2] = _mm_unpacklo_pd( res_re[2], res_im[2] ); - - in[3] = _mm_unpackhi_pd( res_re[0], res_im[0] ); - in[4] = _mm_unpackhi_pd( res_re[1], res_im[1] ); - in[5] = _mm_unpackhi_pd( res_re[2], res_im[2] ); - - in[6] = _mm_unpacklo_pd( res_re[3], res_im[3] ); - in[7] = _mm_unpacklo_pd( res_re[4], res_im[4] ); - in[8] = _mm_unpacklo_pd( res_re[5], res_im[5] ); - - in[9] = _mm_unpackhi_pd( res_re[3], res_im[3] ); - in[10] = _mm_unpackhi_pd( res_re[4], res_im[4] ); - in[11] = _mm_unpackhi_pd( res_re[5], res_im[5] ); - - for ( int i=0; i<12; i++ ) { - res[i] = _mm_sub_pd( res[i], in[i] ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf1 = _mm_mul_pd( gamma0[X], GAMMA_X_SHUFFLE(in[i]) ); - res[3*gamma_co[X][0]+6*(gamma_co[X][0]/2)+i] = _mm_sub_pd( res[3*gamma_co[X][0]+6*(gamma_co[X][0]/2)+i], buf1 ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf1 = _mm_mul_pd( gamma1[X], GAMMA_X_SHUFFLE(in[i+3]) ); - res[3*gamma_co[X][1]+6*(gamma_co[X][1]/2)+i] = _mm_sub_pd( res[3*gamma_co[X][1]+6*(gamma_co[X][1]/2)+i], buf1 ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf1 = _mm_mul_pd( gamma0[X], GAMMA_X_SHUFFLE(in[i+6]) ); - res[3*gamma_co[X][0]+6*(gamma_co[X][0]/2)+6+i] = _mm_sub_pd( 
res[3*gamma_co[X][0]+6*(gamma_co[X][0]/2)+6+i], buf1 ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf1 = _mm_mul_pd( gamma1[X], GAMMA_X_SHUFFLE(in[i+9]) ); - res[3*gamma_co[X][1]+6*(gamma_co[X][1]/2)+6+i] = _mm_sub_pd( res[3*gamma_co[X][1]+6*(gamma_co[X][1]/2)+6+i], buf1 ); - } - } - } - // --------------- - - for ( int i=0; i<24; i++ ) { - _mm_storeu_pd( eta_pt + 2*i, res[i] ); - } - eta_pt+=48; - } - -} - - -void su3_dpbp_float( complex_float* eta, complex_float *prn[4], operator_float_struct *op, - int *neighbor, int start, int end ) { - - float *D_pt = (float*)(op->D_vectorized+2*(start/24*48)); - float *eta_pt = (float*)(eta+start); - float *eta_end_pt = (float*)(eta+end); - float *pr[4] = {(float*)(prn[0]),(float*)(prn[1]),(float*)(prn[2]),(float*)(prn[3])}; - int *nb_pt = neighbor+((start/24)*4); - - __m128 gamma0[4][2]; - __m128 gamma1[4][2]; - - for ( int mu=0; mu<4; mu++ ) { - gamma0[mu][0] = _mm_set1_ps( -gamma_re_sign[mu][gamma_co[mu][0]] ); - gamma0[mu][1] = _mm_set1_ps( -gamma_im_sign[mu][gamma_co[mu][0]] ); - gamma1[mu][0] = _mm_set1_ps( -gamma_re_sign[mu][gamma_co[mu][1]] ); - gamma1[mu][1] = _mm_set1_ps( -gamma_im_sign[mu][gamma_co[mu][1]] ); - } - - while( eta_pt < eta_end_pt ) { - - __m128 eta_lo1 = _mm_loadu_ps( eta_pt ); - __m128 eta_lo2 = _mm_loadu_ps( eta_pt + 4 ); - __m128 eta_hi1 = _mm_loadu_ps( eta_pt + 6 ); - __m128 eta_hi2 = _mm_loadu_ps( eta_pt + 10 ); - __m128 eta_lo3 = _mm_loadu_ps( eta_pt + 12 ); - __m128 eta_lo4 = _mm_loadu_ps( eta_pt + 16 ); - __m128 eta_hi3 = _mm_loadu_ps( eta_pt + 18 ); - __m128 eta_hi4 = _mm_loadu_ps( eta_pt + 22 ); - - __m128 eta2_lo[4]; - __m128 eta2_hi[4]; - - eta2_lo[0] = _mm_loadu_ps( eta_pt + 24 ); - eta2_hi[0] = _mm_loadu_ps( eta_pt + 26 ); - eta2_lo[1] = _mm_loadu_ps( eta_pt + 30 ); - eta2_hi[1] = _mm_loadu_ps( eta_pt + 32 ); - eta2_lo[2] = _mm_loadu_ps( eta_pt + 36 ); - eta2_hi[2] = _mm_loadu_ps( eta_pt + 38 ); - eta2_lo[3] = _mm_loadu_ps( eta_pt + 42 ); - eta2_hi[3] = _mm_loadu_ps( eta_pt + 44 ); - - for ( int mu=0; mu<4; mu++ ) { - __m128 res1[4]; - __m128 res2[4]; - - { - int j = 2*12*(*nb_pt); - // load 1st part of su(3) matrix and multiply - { - __m128 buf1 = _mm_loadu_ps( D_pt ); - __m128 buf2 = _mm_loadu_ps( D_pt+SIMD_LENGTH_float ); - { - __m128 buf3 = _mm_set1_ps( *(pr[mu]+j+0) ); - __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+1) ); - cmul( buf1, buf2, buf3, buf4, &res1[0], &res1[1] ); - } - { - __m128 buf3 = _mm_set1_ps( *(pr[mu]+j+6) ); - __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+7) ); - cmul( buf1, buf2, buf3, buf4, &res2[0], &res2[1] ); - } - { - __m128 buf3 = _mm_set1_ps( *(pr[mu]+j+12) ); - __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+13) ); - cmul( buf1, buf2, buf3, buf4, &res1[2], &res1[3] ); - } - { - __m128 buf3 = _mm_set1_ps( *(pr[mu]+j+18) ); - __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+19) ); - cmul( buf1, buf2, buf3, buf4, &res2[2], &res2[3] ); - } - } - // load 2nd part of su(3) matrix and multiply - { - __m128 buf1 = _mm_loadu_ps( D_pt+2*SIMD_LENGTH_float ); - __m128 buf2 = _mm_loadu_ps( D_pt+3*SIMD_LENGTH_float ); - { - __m128 buf3 = _mm_set1_ps( *(pr[mu]+j+2) ); - __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+3) ); - cfmadd( buf1, buf2, buf3, buf4, &res1[0], &res1[1] ); - } - { - __m128 buf3 = _mm_set1_ps( *(pr[mu]+j+8) ); - __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+9) ); - cfmadd( buf1, buf2, buf3, buf4, &res2[0], &res2[1] ); - } - { - __m128 buf3 = _mm_set1_ps( *(pr[mu]+j+14) ); - __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+15) ); - cfmadd( buf1, buf2, buf3, buf4, &res1[2], &res1[3] ); - } - { - __m128 buf3 = _mm_set1_ps( *(pr[mu]+j+20) 
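In the double-precision kernels above, one SU(3) link matrix is loaded as 18 reals with real and imaginary parts interleaved, six reals per row (the D_pt[0+6*i], D_pt[1+6*i], ... accesses). A plain scalar reference for the same matrix-vector product, assuming that layout:

#include <complex.h>

/* sketch: y = U*x for a 3x3 complex matrix stored as 18 doubles, row-major, re/im interleaved */
static void su3_matvec_ref( const double *U, const double complex x[3], double complex y[3] ) {
  for ( int i=0; i<3; i++ ) {
    y[i] = 0.0;
    for ( int j=0; j<3; j++ )
      y[i] += ( U[6*i+2*j] + I*U[6*i+2*j+1] ) * x[j];
  }
}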
); - __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+21) ); - cfmadd( buf1, buf2, buf3, buf4, &res2[2], &res2[3] ); - } - } - // load 3rd part of su(3) matrix and multiply - { - __m128 buf1 = _mm_loadu_ps( D_pt+4*SIMD_LENGTH_float ); - __m128 buf2 = _mm_loadu_ps( D_pt+5*SIMD_LENGTH_float ); - { - __m128 buf3 = _mm_set1_ps( *(pr[mu]+j+4) ); - __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+5) ); - cfmadd( buf1, buf2, buf3, buf4, &res1[0], &res1[1] ); - } - { - __m128 buf3 = _mm_set1_ps( *(pr[mu]+j+10) ); - __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+11) ); - cfmadd( buf1, buf2, buf3, buf4, &res2[0], &res2[1] ); - } - { - __m128 buf3 = _mm_set1_ps( *(pr[mu]+j+16) ); - __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+17) ); - cfmadd( buf1, buf2, buf3, buf4, &res1[2], &res1[3] ); - } - { - __m128 buf3 = _mm_set1_ps( *(pr[mu]+j+22) ); - __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+23) ); - cfmadd( buf1, buf2, buf3, buf4, &res2[2], &res2[3] ); - } - } - } - - { - // store spin0 contribution - { - __m128 buf1 = _mm_unpacklo_ps( res1[0], res1[1] ); - __m128 buf2 = _mm_unpackhi_ps( res1[0], res1[1] ); - eta_lo1 = _mm_sub_ps( eta_lo1, buf1 ); - eta_lo2 = _mm_sub_ps( eta_lo2, buf2 ); - } - { - __m128 buf1 = _mm_unpacklo_ps( res1[2], res1[3] ); - __m128 buf2 = _mm_unpackhi_ps( res1[2], res1[3] ); - eta_lo3 = _mm_sub_ps( eta_lo3, buf1 ); - eta_lo4 = _mm_sub_ps( eta_lo4, buf2 ); - } - - // store contribution from 1st SU(3) multiplication to either spin2 or spin3 - { - __m128 buf1 = _mm_mul_ps( gamma0[mu][0], res1[gamma_offset[mu][0]] ); - __m128 buf2 = _mm_mul_ps( gamma0[mu][1], res1[1-gamma_offset[mu][0]] ); - __m128 buf3 = _mm_unpacklo_ps( buf1, buf2 ); - __m128 buf4 = _mm_unpackhi_ps( buf1, buf2 ); - buf4 = _mm_shuffle_ps( buf3, buf4, _MM_SHUFFLE(1,0,3,2) ); // hi(buf3), lo(buf4) - eta2_lo[gamma_co[mu][2]] = _mm_sub_ps( eta2_lo[gamma_co[mu][2]], buf3 ); - eta2_hi[gamma_co[mu][2]] = _mm_sub_ps( eta2_hi[gamma_co[mu][2]], buf4 ); - } - { - __m128 buf1 = _mm_mul_ps( gamma0[mu][0], res1[2+gamma_offset[mu][0]] ); - __m128 buf2 = _mm_mul_ps( gamma0[mu][1], res1[3-gamma_offset[mu][0]] ); - __m128 buf3 = _mm_unpacklo_ps( buf1, buf2 ); - __m128 buf4 = _mm_unpackhi_ps( buf1, buf2 ); - buf4 = _mm_shuffle_ps( buf3, buf4, _MM_SHUFFLE(1,0,3,2) ); // hi(buf3), lo(buf4) - eta2_lo[2+gamma_co[mu][2]] = _mm_sub_ps( eta2_lo[2+gamma_co[mu][2]], buf3 ); - eta2_hi[2+gamma_co[mu][2]] = _mm_sub_ps( eta2_hi[2+gamma_co[mu][2]], buf4 ); - } - } - { - // store spin1 contribution - { - __m128 buf1 = _mm_unpacklo_ps( res2[0], res2[1] ); - __m128 buf2 = _mm_unpackhi_ps( res2[0], res2[1] ); - eta_hi1 = _mm_sub_ps( eta_hi1, buf1 ); - eta_hi2 = _mm_sub_ps( eta_hi2, buf2 ); - } - { - __m128 buf1 = _mm_unpacklo_ps( res2[2], res2[3] ); - __m128 buf2 = _mm_unpackhi_ps( res2[2], res2[3] ); - eta_hi3 = _mm_sub_ps( eta_hi3, buf1 ); - eta_hi4 = _mm_sub_ps( eta_hi4, buf2 ); - } - - // store contribution from 1st SU(3) multiplication to either spin2 or spin3 - { - __m128 buf1 = _mm_mul_ps( gamma1[mu][0], res2[gamma_offset[mu][1]] ); - __m128 buf2 = _mm_mul_ps( gamma1[mu][1], res2[1-gamma_offset[mu][1]] ); - __m128 buf3 = _mm_unpacklo_ps( buf1, buf2 ); - __m128 buf4 = _mm_unpackhi_ps( buf1, buf2 ); - buf4 = _mm_shuffle_ps( buf3, buf4, _MM_SHUFFLE(1,0,3,2) ); // hi(buf3), lo(buf4) - eta2_lo[gamma_co[mu][3]] = _mm_sub_ps( eta2_lo[gamma_co[mu][3]], buf3 ); - eta2_hi[gamma_co[mu][3]] = _mm_sub_ps( eta2_hi[gamma_co[mu][3]], buf4 ); - } - { - __m128 buf1 = _mm_mul_ps( gamma1[mu][0], res2[2+gamma_offset[mu][1]] ); - __m128 buf2 = _mm_mul_ps( gamma1[mu][1], res2[3-gamma_offset[mu][1]] ); - __m128 
buf3 = _mm_unpacklo_ps( buf1, buf2 ); - __m128 buf4 = _mm_unpackhi_ps( buf1, buf2 ); - buf4 = _mm_shuffle_ps( buf3, buf4, _MM_SHUFFLE(1,0,3,2) ); // hi(buf3), lo(buf4) - eta2_lo[2+gamma_co[mu][3]] = _mm_sub_ps( eta2_lo[2+gamma_co[mu][3]], buf3 ); - eta2_hi[2+gamma_co[mu][3]] = _mm_sub_ps( eta2_hi[2+gamma_co[mu][3]], buf4 ); - } - } - nb_pt++; - D_pt += 24; - } - - _mm_storeu_ps( eta_pt, eta_lo1 ); - _mm_storeu_ps( eta_pt+4, eta_lo2 ); - _mm_storeu_ps( eta_pt+6, eta_hi1 ); - _mm_storeu_ps( eta_pt+10, eta_hi2 ); - _mm_storeu_ps( eta_pt+12, eta_lo3 ); - _mm_storeu_ps( eta_pt+16, eta_lo4 ); - _mm_storeu_ps( eta_pt+18, eta_hi3 ); - _mm_storeu_ps( eta_pt+22, eta_hi4 ); - _mm_storeu_ps( eta_pt+24, eta2_lo[0] ); - _mm_storeu_ps( eta_pt+26, eta2_hi[0] ); - _mm_storeu_ps( eta_pt+30, eta2_lo[1] ); - _mm_storeu_ps( eta_pt+32, eta2_hi[1] ); - _mm_storeu_ps( eta_pt+36, eta2_lo[2] ); - _mm_storeu_ps( eta_pt+38, eta2_hi[2] ); - _mm_storeu_ps( eta_pt+42, eta2_lo[3] ); - _mm_storeu_ps( eta_pt+44, eta2_hi[3] ); - - eta_pt += 48; - } - -} - - -void block_oddeven_plus_coupling_double( double *eta, double *D, double *phi, int mu, int start, int end, int *ind, int *neighbor ) { } -void block_oddeven_pT_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_sub_ps -#define MU 0 -#include neighbor_coupling_file -#undef MU -#undef UPD -} -void block_oddeven_pZ_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_sub_ps -#define MU 1 -#include neighbor_coupling_file -#undef MU -#undef UPD -} -void block_oddeven_pY_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_sub_ps -#define MU 2 -#include neighbor_coupling_file -#undef MU -#undef UPD -} -void block_oddeven_pX_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_sub_ps -#define MU 3 -#include neighbor_coupling_file -#undef MU -#undef UPD -} -void block_oddeven_plus_coupling_float( float *eta, float *D, float *phi, int mu, int start, int end, int *ind, int *neighbor ) { - switch ( mu ) { - case T: block_oddeven_pT_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - case Z: block_oddeven_pZ_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - case Y: block_oddeven_pY_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - case X: block_oddeven_pX_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - default: error0("block_oddeven_plus_coupling_float: invalid mu=%d\n", mu ); - } -} - - -void block_oddeven_nplus_coupling_double( double *eta, double *D, double *phi, int mu, int start, int end, int *ind, int *neighbor ) { } -void block_oddeven_npT_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_add_ps -#define MU 0 -#include neighbor_coupling_file -#undef MU -#undef UPD -} -void block_oddeven_npZ_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_add_ps -#define MU 1 -#include neighbor_coupling_file -#undef MU -#undef UPD -} -void block_oddeven_npY_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_add_ps -#define MU 2 -#include neighbor_coupling_file -#undef MU -#undef UPD -} -void block_oddeven_npX_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, 
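Each per-direction wrapper above is generated from the single body in neighbor_coupling_file by re-including it with different UPD / MU (and, further down, MINUSDIR / BOUNDARY) settings, while a thin switch dispatches on mu at run time. A compressed sketch of that specialization pattern with hypothetical names, inlining the shared body as a macro instead of an #include:

/* DEFINE_COUPLING, couple_p*, and couple_plus are illustrative only; they show the
 * compile-time specialization scheme, not the actual stencil body. */
#define DEFINE_COUPLING( NAME, MU, UPD )                               \
  static void NAME( float *eta, const float *phi, int n ) {           \
    for ( int i=0; i<n; i++ ) eta[i] = UPD( eta[i], phi[4*i+(MU)] );   \
  }

#define SUB(a,b) ((a)-(b))
#define ADD(a,b) ((a)+(b))

DEFINE_COUPLING( couple_pT,  0, SUB )
DEFINE_COUPLING( couple_pZ,  1, SUB )
DEFINE_COUPLING( couple_npT, 0, ADD )

static void couple_plus( float *eta, const float *phi, int n, int mu ) {
  switch ( mu ) {
    case 0: couple_pT( eta, phi, n ); break;
    case 1: couple_pZ( eta, phi, n ); break;
    default: break; /* remaining directions analogous */
  }
}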
int *neighbor ) { -#define UPD _mm_add_ps -#define MU 3 -#include neighbor_coupling_file -#undef MU -#undef UPD -} -void block_oddeven_nplus_coupling_float( float *eta, float *D, float *phi, int mu, int start, int end, int *ind, int *neighbor ) { - switch ( mu ) { - case T: block_oddeven_npT_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - case Z: block_oddeven_npZ_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - case Y: block_oddeven_npY_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - case X: block_oddeven_npX_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - default: error0("block_oddeven_nplus_coupling_float: invalid mu=%d\n", mu ); - } -} - - -void block_oddeven_minus_coupling_double( double *eta, double *D, double *phi, int mu, int start, int end, int *ind, int *neighbor ) { } -void block_oddeven_mT_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_sub_ps -#define MU 0 -#define MINUSDIR -#include neighbor_coupling_file -#undef MINUSDIR -#undef MU -#undef UPD -} -void block_oddeven_mZ_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_sub_ps -#define MU 1 -#define MINUSDIR -#include neighbor_coupling_file -#undef MINUSDIR -#undef MU -#undef UPD -} -void block_oddeven_mY_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_sub_ps -#define MU 2 -#define MINUSDIR -#include neighbor_coupling_file -#undef MINUSDIR -#undef MU -#undef UPD -} -void block_oddeven_mX_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_sub_ps -#define MU 3 -#define MINUSDIR -#include neighbor_coupling_file -#undef MINUSDIR -#undef MU -#undef UPD -} -void block_oddeven_minus_coupling_float( float *eta, float *D, float *phi, int mu, int start, int end, int *ind, int *neighbor ) { - switch ( mu ) { - case T: block_oddeven_mT_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - case Z: block_oddeven_mZ_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - case Y: block_oddeven_mY_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - case X: block_oddeven_mX_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - default: error0("block_oddeven_minus_coupling_float: invalid mu=%d\n", mu ); - } -} - - -void block_oddeven_nminus_coupling_double( double *eta, double *D, double *phi, int mu, int start, int end, int *ind, int *neighbor ) { } -void block_oddeven_nmT_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_add_ps -#define MU 0 -#define MINUSDIR -#include neighbor_coupling_file -#undef MINUSDIR -#undef MU -#undef UPD -} -void block_oddeven_nmZ_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_add_ps -#define MU 1 -#define MINUSDIR -#include neighbor_coupling_file -#undef MINUSDIR -#undef MU -#undef UPD -} -void block_oddeven_nmY_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_add_ps -#define MU 2 -#define MINUSDIR -#include neighbor_coupling_file -#undef MINUSDIR -#undef MU -#undef UPD -} -void block_oddeven_nmX_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_add_ps -#define MU 3 -#define 
MINUSDIR -#include neighbor_coupling_file -#undef MINUSDIR -#undef MU -#undef UPD -} -void block_oddeven_nminus_coupling_float( float *eta, float *D, float *phi, int mu, int start, int end, int *ind, int *neighbor ) { - switch ( mu ) { - case T: block_oddeven_nmT_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - case Z: block_oddeven_nmZ_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - case Y: block_oddeven_nmY_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - case X: block_oddeven_nmX_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - default: error0("block_oddeven_nminus_coupling_float: invalid mu=%d\n", mu ); - } -} - - -void boundary_nminus_coupling_double( double *eta, double *D, double *phi, int mu, int start, int end, int *ind, int *neighbor ) { } -void boundary_nmT_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_add_ps -#define MU 0 -#define BOUNDARY -#define MINUSDIR -#include neighbor_coupling_file -#undef MINUSDIR -#undef BOUNDARY -#undef MU -#undef UPD -} -void boundary_nmZ_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_add_ps -#define MU 1 -#define BOUNDARY -#define MINUSDIR -#include neighbor_coupling_file -#undef MINUSDIR -#undef BOUNDARY -#undef MU -#undef UPD -} -void boundary_nmY_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_add_ps -#define MU 2 -#define BOUNDARY -#define MINUSDIR -#include neighbor_coupling_file -#undef MINUSDIR -#undef BOUNDARY -#undef MU -#undef UPD -} -void boundary_nmX_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_add_ps -#define MU 3 -#define BOUNDARY -#define MINUSDIR -#include neighbor_coupling_file -#undef MINUSDIR -#undef BOUNDARY -#undef MU -#undef UPD -} -void boundary_nminus_coupling_float( float *eta, float *D, float *phi, int mu, int start, int end, int *ind, int *neighbor ) { - switch ( mu ) { - case T: boundary_nmT_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - case Z: boundary_nmZ_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - case Y: boundary_nmY_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - case X: boundary_nmX_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - default: error0("boundary_nminus_coupling_float: invalid mu=%d\n", mu ); - } -} - - -void boundary_nplus_coupling_double( double *eta, double *D, double *phi, int mu, int start, int end, int *ind, int *neighbor ) { } -void boundary_npT_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_add_ps -#define MU 0 -#define BOUNDARY -#include neighbor_coupling_file -#undef BOUNDARY -#undef MU -#undef UPD -} -void boundary_npZ_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_add_ps -#define MU 1 -#define BOUNDARY -#include neighbor_coupling_file -#undef BOUNDARY -#undef MU -#undef UPD -} -void boundary_npY_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_add_ps -#define MU 2 -#define BOUNDARY -#include neighbor_coupling_file -#undef BOUNDARY -#undef MU -#undef UPD -} -void boundary_npX_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { 
-#define UPD _mm_add_ps -#define MU 3 -#define BOUNDARY -#include neighbor_coupling_file -#undef BOUNDARY -#undef MU -#undef UPD -} -void boundary_nplus_coupling_float( float *eta, float *D, float *phi, int mu, int start, int end, int *ind, int *neighbor ) { - switch ( mu ) { - case T: boundary_npT_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - case Z: boundary_npZ_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - case Y: boundary_npY_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - case X: boundary_npX_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - default: error0("boundary_nplus_coupling_float: invalid mu=%d\n", mu ); - } -} - - -void boundary_minus_coupling_double( double *eta, double *D, double *phi, int mu, int start, int end, int *ind, int *neighbor ) { } -void boundary_mT_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_sub_ps -#define MU 0 -#define BOUNDARY -#define MINUSDIR -#include neighbor_coupling_file -#undef MINUSDIR -#undef BOUNDARY -#undef MU -#undef UPD -} -void boundary_mZ_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_sub_ps -#define MU 1 -#define BOUNDARY -#define MINUSDIR -#include neighbor_coupling_file -#undef MINUSDIR -#undef BOUNDARY -#undef MU -#undef UPD -} -void boundary_mY_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_sub_ps -#define MU 2 -#define BOUNDARY -#define MINUSDIR -#include neighbor_coupling_file -#undef MINUSDIR -#undef BOUNDARY -#undef MU -#undef UPD -} -void boundary_mX_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_sub_ps -#define MU 3 -#define BOUNDARY -#define MINUSDIR -#include neighbor_coupling_file -#undef MINUSDIR -#undef BOUNDARY -#undef MU -#undef UPD -} -void boundary_minus_coupling_float( float *eta, float *D, float *phi, int mu, int start, int end, int *ind, int *neighbor ) { - switch ( mu ) { - case T: boundary_mT_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - case Z: boundary_mZ_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - case Y: boundary_mY_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - case X: boundary_mX_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - default: error0("boundary_minus_coupling_float: invalid mu=%d\n", mu ); - } -} - - -void boundary_plus_coupling_double( double *eta, double *D, double *phi, int mu, int start, int end, int *ind, int *neighbor ) { } -void boundary_pT_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_sub_ps -#define MU 0 -#define BOUNDARY -#include neighbor_coupling_file -#undef BOUNDARY -#undef MU -#undef UPD -} -void boundary_pZ_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_sub_ps -#define MU 1 -#define BOUNDARY -#include neighbor_coupling_file -#undef BOUNDARY -#undef MU -#undef UPD -} -void boundary_pY_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_sub_ps -#define MU 2 -#define BOUNDARY -#include neighbor_coupling_file -#undef BOUNDARY -#undef MU -#undef UPD -} -void boundary_pX_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { 
-#define UPD _mm_sub_ps -#define MU 3 -#define BOUNDARY -#include neighbor_coupling_file -#undef BOUNDARY -#undef MU -#undef UPD -} -void boundary_plus_coupling_float( float *eta, float *D, float *phi, int mu, int start, int end, int *ind, int *neighbor ) { - switch ( mu ) { - case T: boundary_pT_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - case Z: boundary_pZ_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - case Y: boundary_pY_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - case X: boundary_pX_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - default: error0("boundary_plus_coupling_float: invalid mu=%d\n", mu ); - } -} - - - - - -static inline int sse_clover_real_index( int i, int j ) { - return (i/SIMD_LENGTH_float)*12*SIMD_LENGTH_float + SIMD_LENGTH_float*j*2 + i%SIMD_LENGTH_float; -} - -static inline int sse_clover_imag_index( int i, int j ) { - return (i/SIMD_LENGTH_float)*12*SIMD_LENGTH_float + SIMD_LENGTH_float*(j*2+1) + i%SIMD_LENGTH_float; -} - -void sse_set_clover_double( double *out, complex_double *in ) { } - -void sse_set_clover_float( float *out, complex_float *in ) { - - int index; - float sign = 0.0; - for ( int k=0; k<12; k+=SIMD_LENGTH_float ) { - for ( int j=0; j<6; j++ ) { - for ( int i=0; i i+k ) { - // upper triangle - index = 12 + ( 30 - (5-(k+i))*(6-(k+i)) )/2 + (j-(i+k+1)); - sign = 1.0; - } else { - // lower triangle, j < i+k - index = 12 + ( 30 - (5-(j))*(6-(j)) )/2 + ((i+k)-(j+1)); - sign = -1.0; - } - } else { - // i+k >= 6 - // second 6-by-6 matrix - if ( j > i+k-6 ) { - // upper triangle - index = 12 + 15 + ( 30 - (5-(k+i-6))*(6-(k+i-6)) )/2 + (j-(i+k-6+1)); - sign = 1.0; - } else { - // j < i+k-6 - // lower triangle - index = 12 + 15 + ( 30 - (5-(j))*(6-(j)) )/2 + ((i+k-6)-(j+1)); - sign = -1.0; - } - } - out[ sse_clover_real_index(i+k,j) ] = creal_float( (complex_float)in[index] ); - out[ sse_clover_imag_index(i+k,j) ] = sign*cimag_float( (complex_float)in[index] ); - } - } - } -} - -void sse_set_clover_doublet_double( double *out, complex_double *in ) { } - -void sse_set_clover_doublet_float( float *out, complex_float *in ) { - - int index, d; - float sign = 0.0; - for ( int k=0; k<12; k+=SIMD_LENGTH_float ) { - for ( int j=0; j<6; j++ ) { - for ( int i=0; i i+k ) { - // upper triangle - index = 12 + ( 30 - (5-(k+i))*(6-(k+i)) )/2 + (j-(i+k+1)); - sign = 1.0; - } else { - // lower triangle, j < i+k - index = 12 + ( 30 - (5-(j))*(6-(j)) )/2 + ((i+k)-(j+1)); - sign = -1.0; - } - } else { - // i+k >= 6 - // second 6-by-6 matrix - if ( j > i+k-6 ) { - // upper triangle - index = 12 + 15 + ( 30 - (5-(k+i-6))*(6-(k+i-6)) )/2 + (j-(i+k-6+1)); - sign = 1.0; - } else { - // j < i+k-6 - // lower triangle - index = 12 + 15 + ( 30 - (5-(j))*(6-(j)) )/2 + ((i+k-6)-(j+1)); - sign = -1.0; - } - } - d=(i+k<6)?0:6; - out[ sse_clover_real_index(i+k+d,j) ] = creal_float( in[index] ); - out[ sse_clover_imag_index(i+k+d,j) ] = sign*cimag_float( in[index] ); - out[ sse_clover_real_index(i+k+d+6,j) ] = creal_float( in[index] ); - out[ sse_clover_imag_index(i+k+d+6,j) ] = sign*cimag_float( in[index] ); - } - } - } -} - -void sse_add_diagonal_clover_double( double *out, complex_double *diag ) { } - -void sse_add_diagonal_clover_float( float *out, complex_float *diag ) { - for ( int k=0; k<12; k++ ) { - out[ sse_clover_real_index(k,k%6) ] += creal_float( diag[k] ); - out[ sse_clover_imag_index(k,k%6) ] += cimag_float( diag[k] ); - } -} - -void sse_add_diagonal_clover_doublet_double( double *out, 
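sse_set_clover_float above unpacks one site clover term stored as two Hermitian 6x6 blocks: judging by the offsets 12 and 12+15, the first 12 input entries hold the diagonals, and each block's strictly upper triangle follows as 15 packed entries addressed by ( 30 - (5-r)*(6-r) )/2 + (c-(r+1)); the lower triangle is reconstructed from the same entries with the imaginary part negated (the sign = -1.0 branch), i.e. by Hermitian symmetry. A small sketch of the equivalent packed-triangle index:

/* sketch: packed index of the strictly-upper-triangular entry (r,c), 0 <= r < c < 6,
 * of a 6x6 block stored row by row; algebraically identical to the expression above */
static inline int upper_tri6_index( int r, int c ) {
  return r*(11-r)/2 + (c - r - 1);
}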
complex_double *diag ) { } - -void sse_add_diagonal_clover_doublet_float( float *out, complex_float *diag ) { - for ( int k=0; k<6; k++ ) { - out[ sse_clover_real_index(k,k%6) ] += creal_float( diag[k] ); - out[ sse_clover_imag_index(k,k%6) ] += cimag_float( diag[k] ); - out[ sse_clover_real_index(k+6,k%6) ] -= creal_float( diag[k] ); - out[ sse_clover_imag_index(k+6,k%6) ] -= cimag_float( diag[k] ); - } - for ( int k=6; k<12; k++ ) { - out[ sse_clover_real_index(k+6,k%6) ] += creal_float( diag[k] ); - out[ sse_clover_imag_index(k+6,k%6) ] += cimag_float( diag[k] ); - out[ sse_clover_real_index(k+12,k%6) ] -= creal_float( diag[k] ); - out[ sse_clover_imag_index(k+12,k%6) ] -= cimag_float( diag[k] ); - } -} - -void sse_site_clover_double( double *eta, const double *phi, const double *clover ) { - -} - -void sse_site_clover_float( float *eta, const float *phi, float *clover ) { - - __m128 in_re; - __m128 in_im; - - __m128 clov_re; - __m128 clov_im; - - __m128 out_re; - __m128 out_im; - -#ifdef HAVE_TM1p1 - if( g.n_flavours == 2 ) { - // lines 1--4; indeces from 0 to 47 - in_re = _mm_set1_ps( phi[0] ); - in_im = _mm_set1_ps( phi[1] ); - clov_re = _mm_load_ps( clover ); - clov_im = _mm_load_ps( clover+SIMD_LENGTH_float ); - cmul( clov_re, clov_im, in_re, in_im, &out_re, &out_im ); - clover+=2*SIMD_LENGTH_float; - - for ( int i=1; i<6; i++ ) { - in_re = _mm_set1_ps( phi[2*i] ); - in_im = _mm_set1_ps( phi[2*i+1] ); - clov_re = _mm_load_ps( clover ); - clov_im = _mm_load_ps( clover+SIMD_LENGTH_float ); - cfmadd( clov_re, clov_im, in_re, in_im, &out_re, &out_im ); - clover+=2*SIMD_LENGTH_float; - } - - sse_complex_interleaved_store( out_re, out_im, eta ); - - // lines 5--8; indeces from 48 to 95 - in_re = _mm_setr_ps( phi[0], phi[0], phi[12], phi[12] ); - in_im = _mm_setr_ps( phi[1], phi[1], phi[13], phi[13] ); - clov_re = _mm_load_ps( clover ); - clov_im = _mm_load_ps( clover+SIMD_LENGTH_float ); - cmul( clov_re, clov_im, in_re, in_im, &out_re, &out_im ); - clover+=2*SIMD_LENGTH_float; - - for ( int i=1; i<6; i++ ) { - in_re = _mm_setr_ps( phi[2*i], phi[2*i], phi[2*i+12], phi[2*i+12] ); - in_im = _mm_setr_ps( phi[2*i+1], phi[2*i+1], phi[2*i+13], phi[2*i+13] ); - clov_re = _mm_load_ps( clover ); - clov_im = _mm_load_ps( clover+SIMD_LENGTH_float ); - cfmadd( clov_re, clov_im, in_re, in_im, &out_re, &out_im ); - clover+=2*SIMD_LENGTH_float; - } - - sse_complex_interleaved_store( out_re, out_im, eta+8 ); - - // lines 9--12; indeces from 96 to 143 - in_re = _mm_set1_ps( phi[12] ); - in_im = _mm_set1_ps( phi[13] ); - clov_re = _mm_load_ps( clover ); - clov_im = _mm_load_ps( clover+SIMD_LENGTH_float ); - cmul( clov_re, clov_im, in_re, in_im, &out_re, &out_im ); - clover+=2*SIMD_LENGTH_float; - - for ( int i=1; i<6; i++ ) { - in_re = _mm_set1_ps( phi[2*i+12] ); - in_im = _mm_set1_ps( phi[2*i+13] ); - clov_re = _mm_load_ps( clover ); - clov_im = _mm_load_ps( clover+SIMD_LENGTH_float ); - cfmadd( clov_re, clov_im, in_re, in_im, &out_re, &out_im ); - clover+=2*SIMD_LENGTH_float; - } - - sse_complex_interleaved_store( out_re, out_im, eta+16 ); - - // lines 13--16; indeces from 144 to 191 - in_re = _mm_set1_ps( phi[24] ); - in_im = _mm_set1_ps( phi[25] ); - clov_re = _mm_load_ps( clover ); - clov_im = _mm_load_ps( clover+SIMD_LENGTH_float ); - cmul( clov_re, clov_im, in_re, in_im, &out_re, &out_im ); - clover+=2*SIMD_LENGTH_float; - - for ( int i=1; i<6; i++ ) { - in_re = _mm_set1_ps( phi[2*i+24] ); - in_im = _mm_set1_ps( phi[2*i+25] ); - clov_re = _mm_load_ps( clover ); - clov_im = _mm_load_ps( 
clover+SIMD_LENGTH_float ); - cfmadd( clov_re, clov_im, in_re, in_im, &out_re, &out_im ); - clover+=2*SIMD_LENGTH_float; - } - - sse_complex_interleaved_store( out_re, out_im, eta+24 ); - - // lines 17--20; indeces from 192 to 239 - in_re = _mm_setr_ps( phi[24], phi[24], phi[36], phi[36] ); - in_im = _mm_setr_ps( phi[25], phi[25], phi[37], phi[37] ); - clov_re = _mm_load_ps( clover ); - clov_im = _mm_load_ps( clover+SIMD_LENGTH_float ); - cmul( clov_re, clov_im, in_re, in_im, &out_re, &out_im ); - clover+=2*SIMD_LENGTH_float; - - for ( int i=1; i<6; i++ ) { - in_re = _mm_setr_ps( phi[2*i+24], phi[2*i+24], phi[2*i+36], phi[2*i+36] ); - in_im = _mm_setr_ps( phi[2*i+25], phi[2*i+25], phi[2*i+37], phi[2*i+37] ); - clov_re = _mm_load_ps( clover ); - clov_im = _mm_load_ps( clover+SIMD_LENGTH_float ); - cfmadd( clov_re, clov_im, in_re, in_im, &out_re, &out_im ); - clover+=2*SIMD_LENGTH_float; - } - - sse_complex_interleaved_store( out_re, out_im, eta+32 ); - - // lines 21--24; indeces from 240 to 287 - in_re = _mm_set1_ps( phi[36] ); - in_im = _mm_set1_ps( phi[37] ); - clov_re = _mm_load_ps( clover ); - clov_im = _mm_load_ps( clover+SIMD_LENGTH_float ); - cmul( clov_re, clov_im, in_re, in_im, &out_re, &out_im ); - clover+=2*SIMD_LENGTH_float; - - for ( int i=1; i<6; i++ ) { - in_re = _mm_set1_ps( phi[2*i+36] ); - in_im = _mm_set1_ps( phi[2*i+37] ); - clov_re = _mm_load_ps( clover ); - clov_im = _mm_load_ps( clover+SIMD_LENGTH_float ); - cfmadd( clov_re, clov_im, in_re, in_im, &out_re, &out_im ); - clover+=2*SIMD_LENGTH_float; - } - - sse_complex_interleaved_store( out_re, out_im, eta+40 ); - - } else { -#endif - // lines 1--4; indeces from 0 to 47 - in_re = _mm_set1_ps( phi[0] ); - in_im = _mm_set1_ps( phi[1] ); - clov_re = _mm_load_ps( clover ); - clov_im = _mm_load_ps( clover+SIMD_LENGTH_float ); - cmul( clov_re, clov_im, in_re, in_im, &out_re, &out_im ); - clover+=2*SIMD_LENGTH_float; - - for ( int i=1; i<6; i++ ) { - in_re = _mm_set1_ps( phi[2*i] ); - in_im = _mm_set1_ps( phi[2*i+1] ); - clov_re = _mm_load_ps( clover ); - clov_im = _mm_load_ps( clover+SIMD_LENGTH_float ); - cfmadd( clov_re, clov_im, in_re, in_im, &out_re, &out_im ); - clover+=2*SIMD_LENGTH_float; - } - - sse_complex_interleaved_store( out_re, out_im, eta ); - - // lines 5--8; indeces from 48 to 95 - in_re = _mm_setr_ps( phi[0], phi[0], phi[12], phi[12] ); - in_im = _mm_setr_ps( phi[1], phi[1], phi[13], phi[13] ); - clov_re = _mm_load_ps( clover ); - clov_im = _mm_load_ps( clover+SIMD_LENGTH_float ); - cmul( clov_re, clov_im, in_re, in_im, &out_re, &out_im ); - clover+=2*SIMD_LENGTH_float; - - for ( int i=1; i<6; i++ ) { - in_re = _mm_setr_ps( phi[2*i], phi[2*i], phi[2*i+12], phi[2*i+12] ); - in_im = _mm_setr_ps( phi[2*i+1], phi[2*i+1], phi[2*i+13], phi[2*i+13] ); - clov_re = _mm_load_ps( clover ); - clov_im = _mm_load_ps( clover+SIMD_LENGTH_float ); - cfmadd( clov_re, clov_im, in_re, in_im, &out_re, &out_im ); - clover+=2*SIMD_LENGTH_float; - } - - sse_complex_interleaved_store( out_re, out_im, eta+8 ); - - // lines 9--12; indeces from 96 to 143 - in_re = _mm_set1_ps( phi[12] ); - in_im = _mm_set1_ps( phi[13] ); - clov_re = _mm_load_ps( clover ); - clov_im = _mm_load_ps( clover+SIMD_LENGTH_float ); - cmul( clov_re, clov_im, in_re, in_im, &out_re, &out_im ); - clover+=2*SIMD_LENGTH_float; - - for ( int i=1; i<6; i++ ) { - in_re = _mm_set1_ps( phi[2*i+12] ); - in_im = _mm_set1_ps( phi[2*i+13] ); - clov_re = _mm_load_ps( clover ); - clov_im = _mm_load_ps( clover+SIMD_LENGTH_float ); - cfmadd( clov_re, clov_im, in_re, in_im, 
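sse_site_clover_float applies the packed clover term by broadcasting one input component at a time and accumulating whole output column blocks with cmul / cfmadd; in the single-flavour branch the net effect is two independent 6x6 complex blocks acting on the upper and lower halves of the 12-component site spinor. A scalar reference of that action, as a sketch:

#include <complex.h>

/* sketch: eta = C*phi with C block-diagonal, two dense 6x6 complex blocks per site */
static void site_clover_ref( float complex eta[12], const float complex phi[12],
                             const float complex C[2][6][6] ) {
  for ( int b=0; b<2; b++ )
    for ( int i=0; i<6; i++ ) {
      float complex acc = 0.0f;
      for ( int j=0; j<6; j++ )
        acc += C[b][i][j] * phi[6*b+j];
      eta[6*b+i] = acc;
    }
}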
&out_re, &out_im ); - clover+=2*SIMD_LENGTH_float; - } - - sse_complex_interleaved_store( out_re, out_im, eta+16 ); -#ifdef HAVE_TM1p1 - } -#endif - -} - -void sse_site_clover_doublet_double( double *eta, const double *phi, const double *clover ) { - -} - -void sse_site_clover_doublet_float( float *eta, const float *phi, float *clover ) { - - __m128 in_re; - __m128 in_im; - - __m128 clov_re; - __m128 clov_im; - - __m128 out_re; - __m128 out_im; - - // lines 1--4; indeces from 0 to 47 - // lines 5--8; indeces from 48 to 95 - // lines 9--12; indeces from 96 to 143 - for( int n=0; n<3; n++ ) { - in_re = _mm_set1_ps( phi[0] ); - in_im = _mm_set1_ps( phi[1] ); - clov_re = _mm_load_ps( clover ); - clov_im = _mm_load_ps( clover+SIMD_LENGTH_float ); - cmul( clov_re, clov_im, in_re, in_im, &out_re, &out_im ); - clover+=2*SIMD_LENGTH_float; - - for ( int i=1; i<12; i++ ) { - in_re = _mm_set1_ps( phi[2*i] ); - in_im = _mm_set1_ps( phi[2*i+1] ); - clov_re = _mm_load_ps( clover ); - clov_im = _mm_load_ps( clover+SIMD_LENGTH_float ); - cfmadd( clov_re, clov_im, in_re, in_im, &out_re, &out_im ); - clover+=2*SIMD_LENGTH_float; - } - - sse_complex_interleaved_store( out_re, out_im, eta + n*8 ); - } - - - // lines 13--16; indeces from 144 to 191 - // lines 17--20; indeces from 192 to 239 - // lines 21--24; indeces from 240 to 287 - for( int n=3; n<6; n++ ) { - in_re = _mm_set1_ps( phi[24] ); - in_im = _mm_set1_ps( phi[25] ); - clov_re = _mm_load_ps( clover ); - clov_im = _mm_load_ps( clover+SIMD_LENGTH_float ); - cmul( clov_re, clov_im, in_re, in_im, &out_re, &out_im ); - clover+=2*SIMD_LENGTH_float; - - for ( int i=1; i<12; i++ ) { - in_re = _mm_set1_ps( phi[2*i+24] ); - in_im = _mm_set1_ps( phi[2*i+25] ); - clov_re = _mm_load_ps( clover ); - clov_im = _mm_load_ps( clover+SIMD_LENGTH_float ); - cfmadd( clov_re, clov_im, in_re, in_im, &out_re, &out_im ); - clover+=2*SIMD_LENGTH_float; - } - - sse_complex_interleaved_store( out_re, out_im, eta + n*8 ); - } -} - - - -void sse_site_clover_invert_double( double *clover_in, double *clover_out ) { } - -void sse_site_clover_invert_float( float *clover_in, float *clover_out ) { - - float M_tmp1[72], M_tmp2[72]; - - for ( int k=0; k<12; k+=SIMD_LENGTH_float ) { - for ( int j=0; j<6; j++ ) { - for ( int i=k; i -#include - -// res = a*b + c -static inline __m128d sse_fmadd_pd( __m128d a, __m128d b, __m128d c ) { - __m128d res; - res = _mm_mul_pd( a, b ); - res = _mm_add_pd( res, c ); - return res; -} - -// res = -a*b + c -static inline __m128d sse_fnmadd_pd( __m128d a, __m128d b, __m128d c ) { - __m128d res; - res = _mm_mul_pd( a, b ); - res = _mm_sub_pd( c, res ); - return res; -} - -// res = a*b - c -static inline __m128d sse_fmsub_pd( __m128d a, __m128d b, __m128d c ) { - __m128d res; - res = _mm_mul_pd( a, b ); - res = _mm_sub_pd( res, c ); - return res; -} - -static inline double sse_reduce_add_pd( __m128d data ) { - double result; - data = _mm_add_pd( data, _mm_unpackhi_pd( data, data ) ); - _mm_store_sd( &result, data ); - return result; -} - -#endif -#endif \ No newline at end of file diff --git a/src/sse_float_intrinsic.h b/src/sse_float_intrinsic.h deleted file mode 100644 index 33220ba..0000000 --- a/src/sse_float_intrinsic.h +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder. - * - * This file is part of the DDalphaAMG solver library. 
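The sse_fmadd_pd / sse_fnmadd_pd / sse_fmsub_pd helpers above emulate fused multiply-add with separate SSE2 multiplies and adds, and sse_reduce_add_pd folds a register into a scalar with one horizontal add. A short usage sketch (hypothetical function) combining the two patterns:

#include <emmintrin.h>

/* sketch: dot product of two double arrays, n assumed even; the final horizontal
 * add mirrors sse_reduce_add_pd */
static double dot_sse2( const double *a, const double *b, int n ) {
  __m128d acc = _mm_setzero_pd();
  for ( int i=0; i<n; i+=2 )
    acc = _mm_add_pd( _mm_mul_pd( _mm_loadu_pd( a+i ), _mm_loadu_pd( b+i ) ), acc );
  acc = _mm_add_pd( acc, _mm_unpackhi_pd( acc, acc ) );
  double result;
  _mm_store_sd( &result, acc );
  return result;
}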
- * - * The DDalphaAMG solver library is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * The DDalphaAMG solver library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * - * You should have received a copy of the GNU General Public License - * along with the DDalphaAMG solver library. If not, see http://www.gnu.org/licenses/. - * - */ - -#ifndef FLOAT_INTRINSIC_SSE_H -#define FLOAT_INTRINSIC_SSE_H - -#ifdef SSE -#include -#include - -// res = a*b + c -static inline __m128 sse_fmadd( __m128 a, __m128 b, __m128 c ) { - __m128 res; - res = _mm_mul_ps( a, b ); - res = _mm_add_ps( res, c ); - return res; -} - -// res = -a*b + c -static inline __m128 sse_fnmadd( __m128 a, __m128 b, __m128 c ) { - __m128 res; - res = _mm_mul_ps( a, b ); - res = _mm_sub_ps( c, res ); - return res; -} - -// res = a*b - c -static inline __m128 sse_fmsub( __m128 a, __m128 b, __m128 c ) { - __m128 res; - res = _mm_mul_ps( a, b ); - res = _mm_sub_ps( res, c ); - return res; -} - -// res = -a*b - c -static inline __m128 sse_fnmsub( __m128 a, __m128 b, __m128 c ) { - __m128 res; __m128 minus_a; - minus_a = _mm_setzero_ps(); - minus_a = _mm_sub_ps( minus_a, a ); - res = _mm_mul_ps( minus_a, b ); - res = _mm_sub_ps( res, c ); - return res; -} - -static inline void transpose_4_registers( __m128 *data) -{ - __m128 tmp[4]; - - tmp[0] = _mm_unpacklo_ps( data[0], data[1] ); - tmp[1] = _mm_unpacklo_ps( data[2], data[3] ); - tmp[2] = _mm_unpackhi_ps( data[0], data[1] ); - tmp[3] = _mm_unpackhi_ps( data[2], data[3] ); - - data[0] = _mm_movelh_ps( tmp[0], tmp[1] ); - data[1] = _mm_movehl_ps( tmp[1], tmp[0] ); - data[2] = _mm_movelh_ps( tmp[2], tmp[3] ); - data[3] = _mm_movehl_ps( tmp[3], tmp[2] ); -} - - -static inline float sse_reduce_add_ps( __m128 data ) { - float result; - - __m128 tmp; - tmp = _mm_add_ps( data, _mm_movehl_ps( data, data ) ); - data = _mm_add_ss( tmp, _mm_shuffle_ps( tmp, tmp, 1 ) ); - _mm_store_ss( &result, data ); - - return result; -} - -#endif - -#endif // FLOAT_INTRINSIC_SSE_H diff --git a/src/sse_interpolation_generic.c b/src/sse_interpolation_generic.c deleted file mode 100644 index bd5f56a..0000000 --- a/src/sse_interpolation_generic.c +++ /dev/null @@ -1,669 +0,0 @@ -/* - * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder. - * - * This file is part of the DDalphaAMG solver library. - * - * The DDalphaAMG solver library is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * The DDalphaAMG solver library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * - * You should have received a copy of the GNU General Public License - * along with the DDalphaAMG solver library. If not, see http://www.gnu.org/licenses/. 
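transpose_4_registers above performs an in-register 4x4 float transpose via unpack/movelh/movehl; the result is the same as the standard _MM_TRANSPOSE4_PS macro from <xmmintrin.h>. A short usage sketch:

#include <xmmintrin.h>

/* sketch: transpose a row-major 4x4 float tile in place */
static void transpose4x4( float m[16] ) {
  __m128 r0 = _mm_loadu_ps( m+0 ),  r1 = _mm_loadu_ps( m+4 );
  __m128 r2 = _mm_loadu_ps( m+8 ),  r3 = _mm_loadu_ps( m+12 );
  _MM_TRANSPOSE4_PS( r0, r1, r2, r3 );
  _mm_storeu_ps( m+0, r0 );   _mm_storeu_ps( m+4, r1 );
  _mm_storeu_ps( m+8, r2 );   _mm_storeu_ps( m+12, r3 );
}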
- * - */ - -#include "main.h" - -#if defined( SSE ) && defined( INTERPOLATION_OPERATOR_LAYOUT_OPTIMIZED_PRECISION ) - -void interpolation_PRECISION_alloc( level_struct *l ) { - - int k, n = l->num_eig_vect; - - MALLOC( l->is_PRECISION.eigenvalues, complex_PRECISION, n ); - MALLOC( l->is_PRECISION.test_vector, complex_PRECISION*, n ); - -#ifndef INTERPOLATION_SETUP_LAYOUT_OPTIMIZED_PRECISION - MALLOC( l->is_PRECISION.interpolation, complex_PRECISION*, n ); - l->is_PRECISION.interpolation[0] = NULL; - MALLOC_HUGEPAGES( l->is_PRECISION.interpolation[0], complex_PRECISION, n*l->vector_size, 128 ); - for ( k=1; kis_PRECISION.interpolation[k] = l->is_PRECISION.interpolation[0] + k*l->vector_size; -#endif - // ghost shell is communicated in coarse_operator_setup, so we need size=vector_size, not inner_vector_size - MALLOC_HUGEPAGES( l->is_PRECISION.operator, complex_PRECISION, - ((size_t)OPERATOR_COMPONENT_OFFSET_PRECISION)*((size_t)l->vector_size), 128 ); - - l->is_PRECISION.test_vector[0] = NULL; - MALLOC_HUGEPAGES( l->is_PRECISION.test_vector[0], complex_PRECISION, n*l->inner_vector_size, 128 ); - for ( k=1; kis_PRECISION.test_vector[k] = l->is_PRECISION.test_vector[0] + k*l->inner_vector_size; - } -} - - -void interpolation_PRECISION_dummy_alloc( level_struct *l ) { - - MALLOC( l->is_PRECISION.test_vector, complex_PRECISION*, l->num_eig_vect ); - MALLOC( l->is_PRECISION.interpolation, complex_PRECISION*, l->num_eig_vect ); -} - - -void interpolation_PRECISION_dummy_free( level_struct *l ) { - - FREE( l->is_PRECISION.test_vector, complex_PRECISION*, l->num_eig_vect ); - FREE( l->is_PRECISION.interpolation, complex_PRECISION*, l->num_eig_vect ); -} - - -void interpolation_PRECISION_free( level_struct *l ) { - - int n = l->num_eig_vect; - - FREE_HUGEPAGES( l->is_PRECISION.test_vector[0], complex_PRECISION, n*l->inner_vector_size ); - FREE( l->is_PRECISION.eigenvalues, complex_PRECISION, n ); - FREE( l->is_PRECISION.test_vector, complex_PRECISION*, n ); -#ifndef INTERPOLATION_SETUP_LAYOUT_OPTIMIZED_PRECISION - FREE_HUGEPAGES( l->is_PRECISION.interpolation[0], complex_PRECISION, n*l->vector_size ); - FREE( l->is_PRECISION.interpolation, complex_PRECISION*, n ); -#endif - FREE_HUGEPAGES( l->is_PRECISION.operator, complex_PRECISION, OPERATOR_COMPONENT_OFFSET_PRECISION*l->vector_size ); -} - - -void swap8_PRECISION( PRECISION* data ) { - - int i; - PRECISION tmp[8]; - - for ( i=0; i<4; i++ ) { - tmp[i] = data[2*i]; - tmp[i+4] = data[2*i+1]; - } - - for ( i=0; i<8; i++ ) { - data[i] = tmp[i]; - } -} - - -void define_interpolation_PRECISION_operator( complex_PRECISION **interpolation, level_struct *l, struct Thread *threading ) { - - int j, num_eig_vect = l->num_eig_vect; - complex_PRECISION *operator = l->is_PRECISION.operator; - - int start = threading->start_index[l->depth]; - int end = threading->end_index[l->depth]; - - SYNC_CORES(threading) - int offset = SIMD_LENGTH_PRECISION; - for ( j=0; j num_eig_vect) - j_end = num_eig_vect; - - operator = l->is_PRECISION.operator + j*l->vector_size + start*offset; - - for ( int i=start; iis_PRECISION.num_agg, num_eig_vect = l->num_eig_vect, - num_parent_eig_vect = l->num_parent_eig_vect, aggregate_sites = l->num_inner_lattice_sites / num_aggregates; - complex_PRECISION *operator = l->is_PRECISION.operator, *phi_pt = phi, - *phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer; - - START_LOCKED_MASTER(threading) - vector_PRECISION_distribute( phi_c_pt, phi_c, l->next_level ); - END_LOCKED_MASTER(threading) - SYNC_HYPERTHREADS(threading) - -#ifdef 
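interpolation_PRECISION_alloc above requests 128-byte-aligned (and hugepage-backed) storage through the library's MALLOC_HUGEPAGES macro, which is defined elsewhere in the code base. Purely as a generic illustration, comparable alignment can be obtained with posix_memalign:

#include <stdlib.h>
#include <complex.h>

/* sketch: 128-byte-aligned buffer of single-precision complex numbers */
static float complex *alloc_aligned_cfloat( size_t count ) {
  void *p = NULL;
  if ( posix_memalign( &p, 128, count * sizeof(float complex) ) != 0 )
    return NULL;
  return (float complex *)p;
}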
HAVE_TM1p1 - if( g.n_flavours==2 ) - for ( i=threading->n_thread*threading->core + threading->thread; in_core*threading->n_thread ) { - phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer + i*2*2*num_eig_vect; - float tmp_phi1_c_re[2*OPERATOR_COMPONENT_OFFSET_float]; - float tmp_phi1_c_im[2*OPERATOR_COMPONENT_OFFSET_float]; - float tmp_phi2_c_re[2*OPERATOR_COMPONENT_OFFSET_float]; - float tmp_phi2_c_im[2*OPERATOR_COMPONENT_OFFSET_float]; - __m128 zero = _mm_setzero_ps(); - for ( j=0; j<2*OPERATOR_COMPONENT_OFFSET_PRECISION; j+=SIMD_LENGTH_PRECISION ) { - _mm_store_ps(tmp_phi1_c_re+j, zero); - _mm_store_ps(tmp_phi1_c_im+j, zero); - _mm_store_ps(tmp_phi2_c_re+j, zero); - _mm_store_ps(tmp_phi2_c_im+j, zero); - } - // copy phi_c into temporary - for ( j=0; jis_PRECISION.operator + j*l->vector_size/2 + i*2*offset*num_parent_eig_vect*aggregate_sites; - - for ( k=0; kn_thread*threading->core + threading->thread; in_core*threading->n_thread ) { - phi_pt = phi + i*2*num_parent_eig_vect*aggregate_sites; - phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer + i*2*num_eig_vect; - float tmp_phi_c_re[2*OPERATOR_COMPONENT_OFFSET_float]; - float tmp_phi_c_im[2*OPERATOR_COMPONENT_OFFSET_float]; - __m128 zero = _mm_setzero_ps(); - for ( j=0; j<2*OPERATOR_COMPONENT_OFFSET_PRECISION; j+=SIMD_LENGTH_PRECISION ) { - _mm_store_ps(tmp_phi_c_re+j, zero); - _mm_store_ps(tmp_phi_c_im+j, zero); - } - // copy phi_c into temporary - for ( j=0; jis_PRECISION.operator + j*l->vector_size + i*2*offset*num_parent_eig_vect*aggregate_sites; - - for ( k=0; kis_PRECISION.num_agg, num_eig_vect = l->num_eig_vect, - num_parent_eig_vect = l->num_parent_eig_vect, aggregate_sites = l->num_inner_lattice_sites / num_aggregates; - complex_PRECISION *operator = l->is_PRECISION.operator, *phi_pt = phi, - *phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer; - - START_LOCKED_MASTER(threading) - vector_PRECISION_distribute( phi_c_pt, phi_c, l->next_level ); - END_LOCKED_MASTER(threading) - SYNC_HYPERTHREADS(threading) - -#ifdef HAVE_TM1p1 - if( g.n_flavours==2 ) - for ( i=threading->n_thread*threading->core + threading->thread; in_core*threading->n_thread ) { - phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer + i*2*2*num_eig_vect; - - float tmp_phi1_c_re[2*OPERATOR_COMPONENT_OFFSET_float]; - float tmp_phi1_c_im[2*OPERATOR_COMPONENT_OFFSET_float]; - float tmp_phi2_c_re[2*OPERATOR_COMPONENT_OFFSET_float]; - float tmp_phi2_c_im[2*OPERATOR_COMPONENT_OFFSET_float]; - __m128 zero = _mm_setzero_ps(); - for ( j=0; j<2*OPERATOR_COMPONENT_OFFSET_PRECISION; j+=SIMD_LENGTH_PRECISION ) { - _mm_store_ps(tmp_phi1_c_re+j, zero); - _mm_store_ps(tmp_phi1_c_im+j, zero); - _mm_store_ps(tmp_phi2_c_re+j, zero); - _mm_store_ps(tmp_phi2_c_im+j, zero); - } - // copy phi_c into temporary - for ( j=0; jis_PRECISION.operator + j*l->vector_size/2 + i*2*offset*num_parent_eig_vect*aggregate_sites; - - for ( k=0; kn_thread*threading->core + threading->thread; in_core*threading->n_thread ) { - phi_pt = phi + i*2*num_parent_eig_vect*aggregate_sites; - phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer + i*2*num_eig_vect; - float tmp_phi_c_re[2*OPERATOR_COMPONENT_OFFSET_float]; - float tmp_phi_c_im[2*OPERATOR_COMPONENT_OFFSET_float]; - __m128 zero = _mm_setzero_ps(); - for ( j=0; j<2*OPERATOR_COMPONENT_OFFSET_PRECISION; j+=SIMD_LENGTH_PRECISION ) { - _mm_store_ps(tmp_phi_c_re+j, zero); - _mm_store_ps(tmp_phi_c_im+j, zero); - } - // copy phi_c into temporary - for ( j=0; jis_PRECISION.operator + j*l->vector_size + 
i*2*offset*num_parent_eig_vect*aggregate_sites; - - for ( k=0; kis_PRECISION.num_agg, num_eig_vect = l->num_eig_vect, - num_parent_eig_vect = l->num_parent_eig_vect, aggregate_sites = l->num_inner_lattice_sites / num_aggregates; - complex_PRECISION *operator = l->is_PRECISION.operator, *phi_pt = phi, - *phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer; - -#ifdef HAVE_TM1p1 - if( g.n_flavours==2 ) - for ( i=threading->n_thread*threading->core + threading->thread; in_core*threading->n_thread ) { - - int offset = SIMD_LENGTH_PRECISION; - phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer + i*2*2*num_eig_vect; - - // loop over blocks of SIMD_LENGTH_PRECISION vectors - for ( j=0; jis_PRECISION.operator + j*l->vector_size/2 + i*2*offset*num_parent_eig_vect*aggregate_sites; - - // temporary, so we can used aligned load/store, and don't have to mess around with deinterleaving - // complex components and masking - // factor 2 is for low/high (refers to spin components being split up for preserving gamma5-symmetry of coarse operator) - float tmp_phi1_c_re[2*offset]; - float tmp_phi1_c_im[2*offset]; - float tmp_phi2_c_re[2*offset]; - float tmp_phi2_c_im[2*offset]; - __m128 zero = _mm_setzero_ps(); - for ( k1=0; k1<2*offset; k1+=offset ) { - _mm_store_ps(tmp_phi1_c_re+k1, zero); - _mm_store_ps(tmp_phi1_c_im+k1, zero); - _mm_store_ps(tmp_phi2_c_re+k1, zero); - _mm_store_ps(tmp_phi2_c_im+k1, zero); - } - - for ( k=0; k broadcast - __m128 phi1_re = _mm_set1_ps(((float *)phi_pt)[0]); - __m128 phi1_im = _mm_set1_ps(((float *)phi_pt)[1]); - __m128 phi2_re = _mm_set1_ps(((float *)phi_pt)[0+2*num_parent_eig_vect]); - __m128 phi2_im = _mm_set1_ps(((float *)phi_pt)[1+2*num_parent_eig_vect]); - - __m128 operator_re = _mm_load_ps((float *)operator); - __m128 operator_im = _mm_load_ps((float *)operator+offset); - __m128 phi1_c_re = _mm_load_ps(tmp_phi1_c_re+low_high_offset); - __m128 phi1_c_im = _mm_load_ps(tmp_phi1_c_im+low_high_offset); - __m128 phi2_c_re = _mm_load_ps(tmp_phi2_c_re+low_high_offset); - __m128 phi2_c_im = _mm_load_ps(tmp_phi2_c_im+low_high_offset); - - cfmadd_conj(operator_re, operator_im, phi1_re, phi1_im, &phi1_c_re, &phi1_c_im); - cfmadd_conj(operator_re, operator_im, phi2_re, phi2_im, &phi2_c_re, &phi2_c_im); - - _mm_store_ps(tmp_phi1_c_re+low_high_offset, phi1_c_re); - _mm_store_ps(tmp_phi1_c_im+low_high_offset, phi1_c_im); - _mm_store_ps(tmp_phi2_c_re+low_high_offset, phi2_c_re); - _mm_store_ps(tmp_phi2_c_im+low_high_offset, phi2_c_im); - // skip to next real line of matrix - operator += offset; - phi_pt++; - } - phi_pt += num_parent_eig_vect; - low_high_offset = offset; - } - } - - for ( int m=0; m= num_eig_vect ) break; - ((float*)(phi_c_pt+j+m))[0] = tmp_phi1_c_re[m]; - ((float*)(phi_c_pt+j+m))[1] = tmp_phi1_c_im[m]; - } - for ( int m=0; m= num_eig_vect ) break; - ((float*)(phi_c_pt+num_eig_vect+j+m))[0] = tmp_phi2_c_re[m]; - ((float*)(phi_c_pt+num_eig_vect+j+m))[1] = tmp_phi2_c_im[m]; - } - for ( int m=0; m= num_eig_vect ) break; - ((float*)(phi_c_pt+2*num_eig_vect+j+m))[0] = tmp_phi1_c_re[m+offset]; - ((float*)(phi_c_pt+2*num_eig_vect+j+m))[1] = tmp_phi1_c_im[m+offset]; - } - for ( int m=0; m= num_eig_vect ) break; - ((float*)(phi_c_pt+3*num_eig_vect+j+m))[0] = tmp_phi2_c_re[m+offset]; - ((float*)(phi_c_pt+3*num_eig_vect+j+m))[1] = tmp_phi2_c_im[m+offset]; - } - } - } - else -#endif - for ( i=threading->n_thread*threading->core + threading->thread; in_core*threading->n_thread ) { - - int offset = SIMD_LENGTH_PRECISION; - // loop over blocks of SIMD_LENGTH_PRECISION vectors - 
for ( j=0; jnext_level->gs_PRECISION.transfer_buffer + i*2*num_eig_vect; - operator = l->is_PRECISION.operator + j*l->vector_size + i*2*offset*num_parent_eig_vect*aggregate_sites; - - // temporary, so we can used aligned load/store, and don't have to mess around with deinterleaving - // complex components and masking - // factor 2 is for low/high (refers to spin components being split up for preserving gamma5-symmetry of coarse operator) - float tmp_phi_c_re[2*offset]; - float tmp_phi_c_im[2*offset]; - __m128 zero = _mm_setzero_ps(); - for ( k1=0; k1<2*offset; k1+=offset ) { - _mm_store_ps(tmp_phi_c_re+k1, zero); - _mm_store_ps(tmp_phi_c_im+k1, zero); - } - - for ( k=0; k broadcast - __m128 phi_re = _mm_set1_ps(((float *)phi_pt)[0]); - __m128 phi_im = _mm_set1_ps(((float *)phi_pt)[1]); - - __m128 operator_re = _mm_load_ps((float *)operator); - __m128 operator_im = _mm_load_ps((float *)operator+offset); - __m128 phi_c_re = _mm_load_ps(tmp_phi_c_re+low_high_offset); - __m128 phi_c_im = _mm_load_ps(tmp_phi_c_im+low_high_offset); - - cfmadd_conj(operator_re, operator_im, phi_re, phi_im, &phi_c_re, &phi_c_im); - - _mm_store_ps(tmp_phi_c_re+low_high_offset, phi_c_re); - _mm_store_ps(tmp_phi_c_im+low_high_offset, phi_c_im); - // skip to next real line of matrix - operator += offset; - phi_pt++; - } - low_high_offset = offset; - } - } - - for ( int m=0; m= num_eig_vect ) break; - ((float*)(phi_c_pt+j+m))[0] = tmp_phi_c_re[m]; - ((float*)(phi_c_pt+j+m))[1] = tmp_phi_c_im[m]; - } - - for ( int m=0; m= num_eig_vect ) break; - ((float*)(phi_c_pt+num_eig_vect+j+m))[0] = tmp_phi_c_re[m+offset]; - ((float*)(phi_c_pt+num_eig_vect+j+m))[1] = tmp_phi_c_im[m+offset]; - } - } - } - - SYNC_HYPERTHREADS(threading) - START_LOCKED_MASTER(threading) - vector_PRECISION_gather( phi_c, l->next_level->gs_PRECISION.transfer_buffer, l->next_level ); - END_LOCKED_MASTER(threading) - PROF_PRECISION_STOP( _PR, 1, threading ); -} - -#endif // defined( SSE ) && defined( INTERPOLATION_OPERATOR_LAYOUT_OPTIMIZED_PRECISION ) diff --git a/src/sse_interpolation_generic.h b/src/sse_interpolation_generic.h deleted file mode 100644 index 2db7a86..0000000 --- a/src/sse_interpolation_generic.h +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder. - * - * This file is part of the DDalphaAMG solver library. - * - * The DDalphaAMG solver library is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * The DDalphaAMG solver library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * - * You should have received a copy of the GNU General Public License - * along with the DDalphaAMG solver library. If not, see http://www.gnu.org/licenses/. 
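The restriction and interpolation kernels above keep real and imaginary parts in separate SSE registers and accumulate the coarse vector through the cfmadd_conj helper. For reference, a minimal sketch of such a conjugated complex fused multiply-add (hypothetical name sketch_cfmadd_conj; the assumption is that the helper accumulates conj(op)*phi on four deinterleaved floats, matching its use in restrict_PRECISION) could look like this:

#include <xmmintrin.h>

/* acc += conj(op) * phi, with re/im parts held in separate __m128 registers */
static inline void sketch_cfmadd_conj( __m128 op_re, __m128 op_im,
                                        __m128 phi_re, __m128 phi_im,
                                        __m128 *acc_re, __m128 *acc_im ) {
  /* real part: op_re*phi_re + op_im*phi_im */
  *acc_re = _mm_add_ps( *acc_re, _mm_add_ps( _mm_mul_ps( op_re, phi_re ),
                                             _mm_mul_ps( op_im, phi_im ) ) );
  /* imaginary part: op_re*phi_im - op_im*phi_re */
  *acc_im = _mm_add_ps( *acc_im, _mm_sub_ps( _mm_mul_ps( op_re, phi_im ),
                                             _mm_mul_ps( op_im, phi_re ) ) );
}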
- * - */ - -#ifndef SSE_INTERPOLATION_PRECISION_HEADER - #define SSE_INTERPOLATION_PRECISION_HEADER - - #ifdef SSE - void interpolation_PRECISION_alloc( level_struct *l ); - void interpolation_PRECISION_free( level_struct *l ); - void interpolation_PRECISION_dummy_alloc( level_struct *l ); - void interpolation_PRECISION_dummy_free( level_struct *l ); - - void interpolate_PRECISION( vector_PRECISION phi, vector_PRECISION phi_c, level_struct *l, Thread *threading ); - void interpolate3_PRECISION( vector_PRECISION phi, vector_PRECISION phi_c, level_struct *l, Thread *threading ); - void restrict_PRECISION( vector_PRECISION phi_c, vector_PRECISION phi, level_struct *l, Thread *threading ); -#endif - -#endif \ No newline at end of file diff --git a/src/sse_linalg.c b/src/sse_linalg.c deleted file mode 100644 index bf0f9d6..0000000 --- a/src/sse_linalg.c +++ /dev/null @@ -1,795 +0,0 @@ -/* - * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder. - * - * This file is part of the DDalphaAMG solver library. - * - * The DDalphaAMG solver library is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * The DDalphaAMG solver library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * - * You should have received a copy of the GNU General Public License - * along with the DDalphaAMG solver library. If not, see http://www.gnu.org/licenses/. - * - */ - -#include "main.h" - -#ifdef SSE - -#ifdef OPTIMIZED_LINALG_double -void vector_double_scale( vector_double z, vector_double x, complex_double alpha, int start, int end, level_struct *l ) { - - int thread = omp_get_thread_num(); - if(thread == 0 && start != end) - PROF_double_START( _LA6 ); - - __m128d alpha_re = _mm_set1_pd( creal_double(alpha) ); - __m128d alpha_im = _mm_set1_pd( cimag_double(alpha) ); - double *zd = (double*)(z+start); - double *xd = (double*)(x+start); - - for( int i=start; iinner_vector_size ); -} -#endif - -#ifdef OPTIMIZED_LINALG_float -void vector_float_scale( vector_float z, vector_float x, complex_float alpha, int start, int end, level_struct *l ) { - - int thread = omp_get_thread_num(); - if(thread == 0 && start != end) - PROF_float_START( _LA6 ); - - __m128 alpha_re = _mm_set1_ps( creal_float(alpha) ); - __m128 alpha_im = _mm_set1_ps( cimag_float(alpha) ); - float *zf = (float*)(z+start); - float *xf = (float*)(x+start); - - if ( l->depth == 0 ) { - for( int i=start; iinner_vector_size ); -} -#endif - -#ifdef OPTIMIZED_LINALG_float -void vector_float_saxpy( vector_float z, vector_float x, vector_float y, complex_float alpha, int start, int end, level_struct *l ) { - - int thread = omp_get_thread_num(); - if (thread == 0 && start != end ) - PROF_float_START( _LA8 ); - - __m128 alpha_re = _mm_set1_ps( creal_float(alpha) ); - __m128 alpha_im = _mm_set1_ps( cimag_float(alpha) ); - - if ( l->depth == 0 ) { - for ( int i=start; iinner_vector_size ); -} -#endif - -#ifdef OPTIMIZED_LINALG_double -void vector_double_saxpy( vector_double z, vector_double x, vector_double y, complex_double alpha, int start, int end, level_struct *l ) { - - int thread = omp_get_thread_num(); - if (thread == 0 && start != end ) - 
PROF_double_START( _LA8 ); - - __m128d alpha_re = _mm_set1_pd( creal_double(alpha) ); - __m128d alpha_im = _mm_set1_pd( cimag_double(alpha) ); - - for ( int i=start; iinner_vector_size ); -} -#endif - -#ifdef OPTIMIZED_LINALG_double -complex_double global_inner_product_double( vector_double phi, vector_double psi, int start, int end, level_struct *l, struct Thread *threading ) { - - PROF_double_START( _GIP, threading ); - complex_double local_alpha = 0, global_alpha = 0; - - int thread_start; - int thread_end; - compute_core_start_end(start, end, &thread_start, &thread_end, l, threading); - - SYNC_CORES(threading) - - __m128d alpha_re = _mm_setzero_pd(); - __m128d alpha_im = _mm_setzero_pd(); - - if ( l->depth == 0 ) { - for( int i=thread_start; iworkspace)[threading->core] = local_alpha; - END_NO_HYPERTHREADS(threading) - // master sums up all results - SYNC_CORES(threading) - START_MASTER(threading) - for(int i=1; in_core; i++) - ((complex_double *)threading->workspace)[0] += ((complex_double *)threading->workspace)[i]; - local_alpha = ((complex_double *)threading->workspace)[0]; - END_MASTER(threading) - - if ( g.num_processes > 1 ) { - START_MASTER(threading) - PROF_double_START( _ALLR ); - MPI_Allreduce( &local_alpha, &global_alpha, 1, MPI_COMPLEX_double, MPI_SUM, (l->depth==0)?g.comm_cart:l->gs_double.level_comm ); - PROF_double_STOP( _ALLR, 1 ); - ((complex_double *)threading->workspace)[0] = global_alpha; - END_MASTER(threading) - // all threads need the result of the norm - SYNC_MASTER_TO_ALL(threading) - global_alpha = ((complex_double *)threading->workspace)[0]; - PROF_double_STOP( _GIP, (double)(end-start)/(double)l->inner_vector_size, threading ); - return global_alpha; - } else { - // all threads need the result of the norm - SYNC_MASTER_TO_ALL(threading) - local_alpha = ((complex_double *)threading->workspace)[0]; - PROF_double_STOP( _GIP, (double)(end-start)/(double)l->inner_vector_size, threading ); - return local_alpha; - } -} -#endif - -#ifdef OPTIMIZED_LINALG_float -complex_float global_inner_product_float( vector_float phi, vector_float psi, int start, int end, level_struct *l, struct Thread *threading ) { - - PROF_float_START( _GIP, threading ); - complex_float local_alpha = 0, global_alpha = 0; - - int thread_start; - int thread_end; - compute_core_start_end(start, end, &thread_start, &thread_end, l, threading); - - SYNC_CORES(threading) - - __m128 alpha_re = _mm_setzero_ps(); - __m128 alpha_im = _mm_setzero_ps(); - - float *phif = (float*)(phi+thread_start); - float *psif = (float*)(psi+thread_start); - - if ( l->depth == 0 ) { - for( int i=thread_start; iworkspace)[threading->core] = local_alpha; - END_NO_HYPERTHREADS(threading) - // master sums up all results - SYNC_CORES(threading) - START_MASTER(threading) - for(int i=1; in_core; i++) - ((complex_float *)threading->workspace)[0] += ((complex_float *)threading->workspace)[i]; - local_alpha = ((complex_float *)threading->workspace)[0]; - END_MASTER(threading) - - if ( g.num_processes > 1 ) { - START_MASTER(threading) - PROF_float_START( _ALLR ); - MPI_Allreduce( &local_alpha, &global_alpha, 1, MPI_COMPLEX_float, MPI_SUM, (l->depth==0)?g.comm_cart:l->gs_float.level_comm ); - PROF_float_STOP( _ALLR, 1 ); - ((complex_float *)threading->workspace)[0] = global_alpha; - END_MASTER(threading) - // all threads need the result of the norm - SYNC_MASTER_TO_ALL(threading) - global_alpha = ((complex_float *)threading->workspace)[0]; - PROF_float_STOP( _GIP, (double)(end-start)/(double)l->inner_vector_size, threading ); - 
return global_alpha; - } else { - // all threads need the result of the norm - SYNC_MASTER_TO_ALL(threading) - local_alpha = ((complex_float *)threading->workspace)[0]; - PROF_float_STOP( _GIP, (double)(end-start)/(double)l->inner_vector_size, threading ); - return local_alpha; - } -} -#endif - -#ifdef OPTIMIZED_LINALG_double -double global_norm_double( vector_double x, int start, int end, level_struct *l, struct Thread *threading ) { - - PROF_double_START( _GIP, threading ); - - double local_alpha = 0, global_alpha = 0; - - int thread_start; - int thread_end; - compute_core_start_end(start, end, &thread_start, &thread_end, l, threading); - - SYNC_CORES(threading) - - VECTOR_FOR( int i=thread_start, iworkspace)[threading->core] = local_alpha; - END_NO_HYPERTHREADS(threading) - // master sums up all results - SYNC_CORES(threading) - START_MASTER(threading) - for(int i=1; in_core; i++) - ((double *)threading->workspace)[0] += ((double *)threading->workspace)[i]; - local_alpha = ((double *)threading->workspace)[0]; - END_MASTER(threading) - - if ( g.num_processes > 1 ) { - START_MASTER(threading) - PROF_double_START( _ALLR ); - MPI_Allreduce( &local_alpha, &global_alpha, 1, MPI_double, MPI_SUM, (l->depth==0)?g.comm_cart:l->gs_double.level_comm ); - PROF_double_STOP( _ALLR, 1 ); - ((double *)threading->workspace)[0] = global_alpha; - END_MASTER(threading) - // all threads need the result of the norm - SYNC_MASTER_TO_ALL(threading) - global_alpha = ((double *)threading->workspace)[0]; - PROF_double_STOP( _GIP, (double)(end-start)/(double)l->inner_vector_size, threading ); - return (double)sqrt((double)global_alpha); - } else { - // all threads need the result of the norm - SYNC_MASTER_TO_ALL(threading) - local_alpha = ((double *)threading->workspace)[0]; - PROF_double_STOP( _GIP, (double)(end-start)/(double)l->inner_vector_size, threading ); - return (double)sqrt((double)local_alpha); - } -} -#endif - -#ifdef OPTIMIZED_LINALG_float -float global_norm_float( vector_float x, int start, int end, level_struct *l, struct Thread *threading ) { - - PROF_float_START( _GIP, threading ); - - float local_alpha = 0, global_alpha = 0; - - int thread_start; - int thread_end; - compute_core_start_end(start, end, &thread_start, &thread_end, l, threading); - - SYNC_CORES(threading) - - __m128 alpha = _mm_setzero_ps(); - - if ( l->depth == 0 ) { - for( int i=thread_start; iworkspace)[threading->core] = local_alpha; - END_NO_HYPERTHREADS(threading) - // master sums up all results - SYNC_CORES(threading) - START_MASTER(threading) - for(int i=1; in_core; i++) - ((float *)threading->workspace)[0] += ((float *)threading->workspace)[i]; - local_alpha = ((float *)threading->workspace)[0]; - END_MASTER(threading) - - if ( g.num_processes > 1 ) { - START_MASTER(threading) - PROF_float_START( _ALLR ); - MPI_Allreduce( &local_alpha, &global_alpha, 1, MPI_float, MPI_SUM, (l->depth==0)?g.comm_cart:l->gs_float.level_comm ); - PROF_float_STOP( _ALLR, 1 ); - ((float *)threading->workspace)[0] = global_alpha; - END_MASTER(threading) - // all threads need the result of the norm - SYNC_MASTER_TO_ALL(threading) - global_alpha = ((float *)threading->workspace)[0]; - PROF_float_STOP( _GIP, (double)(end-start)/(double)l->inner_vector_size, threading ); - return (float)sqrt((double)global_alpha); - } else { - // all threads need the result of the norm - SYNC_MASTER_TO_ALL(threading) - local_alpha = ((float *)threading->workspace)[0]; - PROF_float_STOP( _GIP, (double)(end-start)/(double)l->inner_vector_size, threading ); - return 
(float)sqrt((double)local_alpha); - } -} -#endif - -#ifdef OPTIMIZED_LINALG_double -void vector_double_multi_saxpy( vector_double z, vector_double *V, complex_double *alpha, - int sign, int count, int start, int end, level_struct *l ) { - - int thread = omp_get_thread_num(); - if (thread == 0 && start != end ) - PROF_double_START( _LA8 ); - - int flag = 0; - __m128d alpha_re[count]; __m128d alpha_im[count]; - for ( int c=0; c EPS_double || -cimag_double(alpha[c]) > EPS_double ) - flag = 1; - } - - if ( flag == 0 ) { - for ( int c=0; c EPS_float || -cimag_float(alpha[c]) > EPS_float ) - flag = 1; - } - - if ( l->depth == 0 ) { - if ( flag == 0 ) { - for ( int c=0; cworkspace)[threading->core] = results; - END_NO_HYPERTHREADS(threading) - // master sums up all results - SYNC_CORES(threading) - START_MASTER(threading) - for(int c=0; cn_core; i++) - ((complex_double **)threading->workspace)[0][c] += ((complex_double **)threading->workspace)[i][c]; - END_MASTER(threading) - // all threads need the result of the norm - SYNC_MASTER_TO_ALL(threading) - for(int c=0; cworkspace)[0][c]; - - PROF_float_STOP( _PIP, (double)(end-start)/(double)l->inner_vector_size, threading ); -} -#endif - -#ifdef OPTIMIZED_LINALG_float -void process_multi_inner_product_float( int count, complex_float *results, vector_float *phi, vector_float psi, - int start, int end, level_struct *l, struct Thread *threading ) { - - PROF_float_START( _PIP, threading ); - int i; - for(int c=0; cdepth == 0 ) { - compute_core_start_end_custom(start, end, &thread_start, &thread_end, l, threading, 12); - for(int c=0; cworkspace)[threading->core] = results; - END_NO_HYPERTHREADS(threading) - // master sums up all results - SYNC_CORES(threading) - START_MASTER(threading) - for(int c=0; cn_core; i++) - ((complex_float **)threading->workspace)[0][c] += ((complex_float **)threading->workspace)[i][c]; - END_MASTER(threading) - // all threads need the result of the norm - SYNC_MASTER_TO_ALL(threading) - for(int c=0; cworkspace)[0][c]; - - PROF_float_STOP( _PIP, (double)(count*(end-start))/(double)l->inner_vector_size, threading ); -} -#endif - -#ifdef OPTIMIZED_LINALG_double -void process_multi_inner_product_double( int count, complex_double *results, vector_double *phi, vector_double psi, - int start, int end, level_struct *l, struct Thread *threading ) { - - PROF_double_START( _PIP, threading ); - int i; - for(int c=0; cdepth == 0 ) { - compute_core_start_end_custom(start, end, &thread_start, &thread_end, l, threading, 12); - for(int c=0; cworkspace)[threading->core] = results; - END_NO_HYPERTHREADS(threading) - // master sums up all results - SYNC_CORES(threading) - START_MASTER(threading) - for(int c=0; cn_core; i++) - ((complex_double **)threading->workspace)[0][c] += ((complex_double **)threading->workspace)[i][c]; - END_MASTER(threading) - // all threads need the result of the norm - SYNC_MASTER_TO_ALL(threading) - for(int c=0; cworkspace)[0][c]; - - PROF_double_STOP( _PIP, (double)(count*(end-start))/(double)l->inner_vector_size, threading ); -} -#endif - -#endif // SSE - diff --git a/src/sse_linalg.h b/src/sse_linalg.h deleted file mode 100644 index cd88fad..0000000 --- a/src/sse_linalg.h +++ /dev/null @@ -1,497 +0,0 @@ -/* - * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder. - * - * This file is part of the DDalphaAMG solver library. 
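The global_norm, global_inner_product and process_multi_inner_product routines above all follow the same reduction pattern: every core writes its partial sum into threading->workspace, the master core adds the per-core results, and a single MPI_Allreduce combines the per-process values. Stripped of threading and profiling, a minimal sketch of that pattern (hypothetical name, plain MPI_DOUBLE instead of the library's MPI_double wrapper) is:

#include <math.h>
#include <mpi.h>

double sketch_global_norm( const double *x, int local_n, MPI_Comm comm ) {
  double local = 0.0, global = 0.0;
  for ( int i = 0; i < local_n; i++ )      /* process-local partial sum */
    local += x[i] * x[i];
  /* sum the partial results over all ranks, then take the square root */
  MPI_Allreduce( &local, &global, 1, MPI_DOUBLE, MPI_SUM, comm );
  return sqrt( global );
}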
- * - * The DDalphaAMG solver library is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * The DDalphaAMG solver library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * - * You should have received a copy of the GNU General Public License - * along with the DDalphaAMG solver library. If not, see http://www.gnu.org/licenses/. - * - */ - -#ifndef LINALG_SSE_H -#define LINALG_SSE_H -#ifdef SSE - - -// Standard Gram-Schmidt on aggregates -static inline void sse_aggregate_gram_schmidt_float( complex_float *V, const int num_vec, - level_struct *l, struct Thread *threading ); -// Gram-Schmidt on a block of vectors, used by Block-Gram-Schmidt -static inline void sse_aggregate_gram_schmidt_block_float( float *V, - int num_vec, int leading_dimension, level_struct *l, struct Thread *threading ); -// used by Block-Gram-Schmidt -static inline void sse_aggregate_block_dot_block_float( float *S, float *U, float *B, - int num_vec, int leading_dimension, level_struct *l, struct Thread *threading ); -// used by Block-Gram-Schmidt -static inline void sse_aggregate_block_minus_block_times_dot_float( float *B, float *U, float *S, - int num_vec, int leading_dimension, level_struct *l, struct Thread *threading ); - -static inline void sse_aggregate_gram_schmidt_double( complex_double *V, const int num_vec, - level_struct *l, struct Thread *threading ) {} -static inline void sse_aggregate_gram_schmidt_block_double( double *V, - int num_vec, int leading_dimension, level_struct *l, struct Thread *threading ) {} -static inline void sse_aggregate_block_dot_block_double( double *S, double *U, double *B, - int num_vec, int leading_dimension, level_struct *l, struct Thread *threading ) {} -static inline void sse_aggregate_block_minus_block_times_dot_double( double *B, double *U, double *S, - int num_vec, int leading_dimension, level_struct *l, struct Thread *threading ) {} - - -static inline void sse_aggregate_gram_schmidt_float( complex_float *V, const int num_vec, level_struct *l, struct Thread *threading ) { - - PROF_float_START( _GRAM_SCHMIDT_ON_AGGREGATES, threading ); - SYNC_CORES(threading) - SYNC_HYPERTHREADS(threading) - long int i, j, k, k1, k2, k3, num_aggregates = l->s_float.num_aggregates, - aggregate_size = l->inner_vector_size / num_aggregates, offset = l->num_lattice_site_var/2; - - float *v_pt1; - float *v_pt2; - float norm1, norm2; - float next_norm1; - float next_norm2; - int ldv = SIMD_LENGTH_float; - int V_block_offset = 2*l->vector_size; - - for ( j=threading->n_thread*threading->core+threading->thread; jn_thread*threading->n_core ) { - - v_pt1 = (float *)V + 0 + j*aggregate_size*2*ldv; - - next_norm1 = 0.0; - next_norm2 = 0.0; - for ( i=0; is_float.num_aggregates, - aggregate_size = l->inner_vector_size / num_aggregates, offset = l->num_lattice_site_var/2; - - float *v_pt1; - float *v_pt2; - float norm; - float next_norm; - int ldv = leading_dimension; - //offset = 6; - - - // current thread chooses an aggregate - for ( int jp=threading->core; jp<2*num_aggregates; jp+=threading->n_core ) { - j = jp/2; - int component = jp%2; - - - v_pt1 = V + 2*component*offset*ldv + j*aggregate_size*2*ldv; - - next_norm = 0.0; - - // 
for the whole aggregate - for ( i=0; is_float.num_aggregates; - int aggregate_size = l->inner_vector_size / num_aggregates; - int offset = l->num_lattice_site_var/2; - - for ( int jp=threading->core; jpn_core ) { - int j = jp/2; - int component = jp%2; - // factors 2 are for complex and spin01/23 aggregates - Up = U + 2*component*offset*leading_dimension + 2*2*j*aggregate_size*leading_dimension; - Bp = B + 2*component*offset*leading_dimension + 2*2*j*aggregate_size*leading_dimension; - __m128 U_re; - __m128 U_im; - __m128 B_re; - __m128 B_im; - __m128 S_re[SIMD_LENGTH_float]; - __m128 S_im[SIMD_LENGTH_float]; - for( int i=0; is_float.num_aggregates; - int aggregate_size = l->inner_vector_size / num_aggregates; - int offset = l->num_lattice_site_var/2; - - for ( int jp=threading->core; jpn_core ) { - int j = jp/2; - int component = jp%2; - // factors 2 are for complex and spin01/23 aggregates - Up = U + 2*component*offset*leading_dimension + 2*2*j*aggregate_size*leading_dimension; - Bp = B + 2*component*offset*leading_dimension + 2*2*j*aggregate_size*leading_dimension; - __m128 U_re; - __m128 U_im; - __m128 B_re; - __m128 B_im; - __m128 S_re[SIMD_LENGTH_float]; - __m128 S_im[SIMD_LENGTH_float]; - for( int i=0; ivector_size), - (PRECISION *)(V + j*l->vector_size), vecs, l, threading ); - aggregate_gram_schmidt_block_PRECISION( (PRECISION *)(V + i*l->vector_size), vecs, SIMD_LENGTH_PRECISION, l, threading ); - } - SYNC_CORES(threading) - PROF_PRECISION_STOP( _GRAM_SCHMIDT_ON_AGGREGATES, 1, threading ); -} - - -void gram_schmidt_on_aggregates_PRECISION_vectorized( complex_PRECISION *V, const int num_vec, level_struct *l, struct Thread *threading ) { - - // the block version has some optimizations which are correct only on the fine grid - if(l->depth == 0) - aggregate_block_gram_schmidt_PRECISION_vectorized(V, num_vec, l, threading); - else - aggregate_gram_schmidt_PRECISION_vectorized(V, num_vec, l, threading); -} - - -void aggregate_orthogonalize_block_wrt_orthonormal_block_PRECISION( PRECISION *B, PRECISION *U, int num_vec, level_struct *l, struct Thread *threading ) { - START_NO_HYPERTHREADS(threading) - - PRECISION *S = NULL; - START_LOCKED_MASTER(threading) - // factors 2 are for complex and spin01/23 aggregates - MALLOC_HUGEPAGES(S, PRECISION, 2*2*l->s_PRECISION.num_aggregates*SIMD_LENGTH_PRECISION*SIMD_LENGTH_PRECISION, 64); - ((PRECISION **)threading->workspace)[0] = S; - END_LOCKED_MASTER(threading) - S = ((PRECISION **)threading->workspace)[0]; - - aggregate_block_dot_block_PRECISION(S, U, B, num_vec, SIMD_LENGTH_PRECISION, l , threading); - aggregate_block_minus_block_times_dot_PRECISION(B, U, S, num_vec, SIMD_LENGTH_PRECISION, l , threading); - - START_LOCKED_MASTER(threading) - FREE_HUGEPAGES(S, PRECISION, 2*2*l->s_PRECISION.num_aggregates*SIMD_LENGTH_PRECISION*SIMD_LENGTH_PRECISION); - END_LOCKED_MASTER(threading) - - END_NO_HYPERTHREADS(threading) -} - - -void aggregate_block_dot_block_PRECISION( PRECISION *S, PRECISION *U, PRECISION *B, - int num_vec, int leading_dimension, level_struct *l, struct Thread *threading ) { - sse_aggregate_block_dot_block_PRECISION( S, U, B, num_vec, leading_dimension, l, threading ); -} - - -void aggregate_block_minus_block_times_dot_PRECISION( PRECISION *B, PRECISION *U, PRECISION *S, - int num_vec, int leading_dimension, level_struct *l, struct Thread *threading ) { - sse_aggregate_block_minus_block_times_dot_PRECISION( B, U, S, num_vec, leading_dimension, l, threading ); -} - -#ifdef GRAM_SCHMIDT_VECTORIZED_PRECISION -void 
setup_gram_schmidt_PRECISION_compute_dots( - complex_PRECISION *thread_buffer, vector_PRECISION *V, int count, int offset, - int start, int end, level_struct *l, struct Thread *threading) { - - int thread_start; - int thread_end; - int cache_block_size = 12*16; - - for(int i=0; i<2*offset; i++) - thread_buffer[i] = 0.0; - - SYNC_CORES(threading) - compute_core_start_end_custom(start, end, &thread_start, &thread_end, l, threading, cache_block_size); - - __m128 dot_re[count]; - __m128 dot_im[count]; - __m128 dot_gamma5_re[count]; - __m128 dot_gamma5_im[count]; - - for ( int j=0; j can use 3 pre-defined +/-1 patterns - __m128 gamma5[3]; - gamma5[0] = _mm_set_ps( -1.0,-1.0,-1.0,-1.0 ); - gamma5[1] = _mm_set_ps( 1.0, 1.0,-1.0,-1.0 ); - gamma5[2] = _mm_set_ps( 1.0, 1.0, 1.0, 1.0 ); - - for(int m=0; m<3; m++) { - - sse_complex_deinterleaved_load( (float*)(V[j]+i+k+4*m), &vj_re, &vj_im ); - sse_complex_deinterleaved_load( (float*)(V[count]+i+k+4*m), &v_re, &v_im ); - - gamma5_v_re = _mm_mul_ps(gamma5[m], v_re); - gamma5_v_im = _mm_mul_ps(gamma5[m], v_im); - - cfmadd_conj(vj_re, vj_im, v_re, v_im, dot_re+j, dot_im+j); - cfmadd_conj(vj_re, vj_im, gamma5_v_re, gamma5_v_im, dot_gamma5_re+j, dot_gamma5_im+j); - } - } - } - } - for ( int j=0; jworkspace)[threading->core] = thread_buffer; - END_NO_HYPERTHREADS(threading) - // master sums up all results - SYNC_CORES(threading) - START_MASTER(threading) - for(int i=1; in_core; i++) { - for(int j=0; jworkspace)[0][j] += ((complex_PRECISION **)threading->workspace)[i][j]; - ((complex_PRECISION **)threading->workspace)[0][j+offset] += ((complex_PRECISION **)threading->workspace)[i][j+offset]; - } - } - END_MASTER(threading) - // only master needs the result in this case (it will be distributed later) -} -#endif - -#ifdef GRAM_SCHMIDT_VECTORIZED_PRECISION -void setup_gram_schmidt_PRECISION_axpys( - complex_PRECISION *thread_buffer, vector_PRECISION *V, int count, int offset, - int start, int end, level_struct *l, struct Thread *threading) { - - int thread_start; - int thread_end; - int cache_block_size = 12*16; - - compute_core_start_end_custom(start, end, &thread_start, &thread_end, l, threading, cache_block_size); - - __m128 dot_re[count]; - __m128 dot_im[count]; - __m128 dot_gamma5_re[count]; - __m128 dot_gamma5_im[count]; - - for ( int j=0; j can use 3 pre-defined +/-1 patterns - __m128 gamma5[3]; - gamma5[0] = _mm_set_ps( -1.0,-1.0,-1.0,-1.0 ); - gamma5[1] = _mm_set_ps( 1.0, 1.0,-1.0,-1.0 ); - gamma5[2] = _mm_set_ps( 1.0, 1.0, 1.0, 1.0 ); - - for(int m=0; m<3; m++) { - - sse_complex_deinterleaved_load( (float*)(V[j]+i+k+4*m), &vj_re, &vj_im ); - sse_complex_deinterleaved_load( (float*)(V[count]+i+k+4*m), &v_re, &v_im ); - - gamma5_vj_re = _mm_mul_ps(gamma5[m], vj_re); - gamma5_vj_im = _mm_mul_ps(gamma5[m], vj_im); - - cfnmadd(vj_re, vj_im, dot_re[j], dot_im[j], &v_re, &v_im); - cfnmadd(gamma5_vj_re, gamma5_vj_im, dot_gamma5_re[j], dot_gamma5_im[j], &v_re, &v_im); - - sse_complex_interleaved_store(v_re, v_im, (float*)(V[count]+i+k+4*m) ); - } - } - } - } -} -#endif - -#endif diff --git a/src/sse_linalg_generic.h b/src/sse_linalg_generic.h deleted file mode 100644 index 00390d5..0000000 --- a/src/sse_linalg_generic.h +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder. - * - * This file is part of the DDalphaAMG solver library. 
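The aggregate Gram-Schmidt kernels deleted above are SSE-blocked versions of classical Gram-Schmidt applied separately on each aggregate, and separately on the two spin halves so that the gamma5 structure of the coarse operator is preserved. A scalar reference of the underlying orthonormalization, assuming num_vec test vectors of aggregate length n stored contiguously as V[k*n + i] (a layout assumption made only for this sketch), would be:

#include <complex.h>
#include <math.h>

void sketch_aggregate_gram_schmidt( double complex *V, int num_vec, int n ) {
  for ( int k = 0; k < num_vec; k++ ) {
    for ( int j = 0; j < k; j++ ) {                 /* project out earlier vectors */
      double complex dot = 0.0;
      for ( int i = 0; i < n; i++ ) dot += conj( V[j*n + i] ) * V[k*n + i];
      for ( int i = 0; i < n; i++ ) V[k*n + i] -= dot * V[j*n + i];
    }
    double norm = 0.0;                              /* normalize the k-th vector */
    for ( int i = 0; i < n; i++ ) norm += creal( conj( V[k*n + i] ) * V[k*n + i] );
    norm = sqrt( norm );
    for ( int i = 0; i < n; i++ ) V[k*n + i] /= norm;
  }
}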
- * - * The DDalphaAMG solver library is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * The DDalphaAMG solver library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * - * You should have received a copy of the GNU General Public License - * along with the DDalphaAMG solver library. If not, see http://www.gnu.org/licenses/. - * - */ - -#ifndef SSE_LINALG_PRECISION_HEADER - #define SSE_LINALG_PRECISION_HEADER - #ifdef SSE - - void gram_schmidt_on_aggregates_PRECISION_vectorized( complex_PRECISION *V, const int num_vec, level_struct *l, struct Thread *threading ); - // Block-Gram-Schmidt on aggregates - void aggregate_block_gram_schmidt_PRECISION_vectorized( complex_PRECISION *V, const int num_vec, level_struct *l, struct Thread *threading ); - // Standard Gram-Schmidt on aggregates - void aggregate_gram_schmidt_PRECISION_vectorized( complex_PRECISION *V, const int num_vec, level_struct *l, struct Thread *threading ); - - // Gram-Schmidt on a block of vectors, used by Block-Gram-Schmidt - void aggregate_gram_schmidt_block_PRECISION( PRECISION *V, - int num_vec, int leading_dimension, level_struct *l, struct Thread *threading ); - // used by Block-Gram-Schmidt - void aggregate_orthogonalize_block_wrt_orthonormal_block_PRECISION( PRECISION *B, PRECISION *U, - int num_vec, level_struct *l, struct Thread *threading ); - // used by Block-Gram-Schmidt - void aggregate_block_dot_block_PRECISION( PRECISION *S, PRECISION *U, PRECISION *B, - int num_vec, int leading_dimension, level_struct *l, struct Thread *threading ); - // used by Block-Gram-Schmidt - void aggregate_block_minus_block_times_dot_PRECISION( PRECISION *B, PRECISION *U, PRECISION *S, - int num_vec, int leading_dimension, level_struct *l, struct Thread *threading ); - - void setup_gram_schmidt_PRECISION_compute_dots( - complex_PRECISION *thread_buffer, vector_PRECISION *V, int count, int offset, - int start, int end, level_struct *l, struct Thread *threading); - - void setup_gram_schmidt_PRECISION_axpys( - complex_PRECISION *thread_buffer, vector_PRECISION *V, int count, int offset, - int start, int end, level_struct *l, struct Thread *threading); - -#endif -#endif \ No newline at end of file diff --git a/src/threading.c b/src/threading.c index d793c3a..aa731f9 100644 --- a/src/threading.c +++ b/src/threading.c @@ -116,12 +116,8 @@ void setup_no_threading(struct Thread *no_threading, struct level_struct *l) void compute_core_start_end(int start, int end, int *core_start, int *core_end, struct level_struct *l, struct Thread *threading) { -#ifdef SSE - int min_per_core = 2*l->num_lattice_site_var; -#else // due to loop unrolling in low level functions int min_per_core = 3*40; -#endif // printf0("min_per_core = %d\n", min_per_core ); compute_core_start_end_custom(start, end, core_start, core_end, l, threading, min_per_core); } diff --git a/src/top_level.c b/src/top_level.c index 68fa204..5e04827 100644 --- a/src/top_level.c +++ b/src/top_level.c @@ -21,25 +21,29 @@ #include "main.h" -void rhs_define( vector_double rhs, level_struct *l, struct Thread *threading ) { +void rhs_define( vector_double *rhs, level_struct *l, struct Thread *threading ) { // no 
hyperthreading here if(threading->thread != 0) return; - int start = threading->start_index[l->depth]; - int end = threading->end_index[l->depth]; + //int start = threading->start_index[l->depth]; + //int end = threading->end_index[l->depth]; if ( g.rhs == 0 ) { - vector_double_define( rhs, 1, start, end, l ); + //vector_double_define( rhs, 1, start, end, l ); + vector_double_define_new( rhs, 1, l, threading ); START_MASTER(threading) if ( g.print > 0 ) printf0("rhs = ones\n"); END_MASTER(threading) } else if ( g.rhs == 1 ) { - vector_double_define( rhs, 0, start, end, l ); + //vector_double_define( rhs, 0, start, end, l ); + vector_double_define_new( rhs, 0, l, threading ); if ( g.my_rank == 0 ) { START_LOCKED_MASTER(threading) - rhs[0] = 1.0; + //rhs->vector_buffer[0] = 1.0; + for ( int i=0; inum_vect; i++ ) + rhs->vector_buffer[i*(rhs->size)] = 1.0; END_LOCKED_MASTER(threading) } START_MASTER(threading) @@ -48,13 +52,15 @@ void rhs_define( vector_double rhs, level_struct *l, struct Thread *threading ) } else if ( g.rhs == 2 ) { // this would yield different results if we threaded it, so we don't START_LOCKED_MASTER(threading) - vector_double_define_random( rhs, 0, l->inner_vector_size, l ); + //vector_double_define_random( rhs, 0, l->inner_vector_size, l ); + vector_double_define_random_new( rhs, l, threading ); END_LOCKED_MASTER(threading) START_MASTER(threading) if ( g.print > 0 ) printf0("rhs = random\n"); END_MASTER(threading) } else if ( g.rhs == 3 ) { - vector_double_define( rhs, 0, start, end, l ); + //vector_double_define( rhs, 0, start, end, l ); + vector_double_define_new( rhs, 0, l, threading ); } else { ASSERT( g.rhs >= 0 && g.rhs <= 4 ); } @@ -62,10 +68,9 @@ void rhs_define( vector_double rhs, level_struct *l, struct Thread *threading ) } -int wilson_driver( vector_double solution, vector_double source, level_struct *l, struct Thread *threading ) { - - int iter = 0, start = threading->start_index[l->depth], end = threading->end_index[l->depth]; +int wilson_driver( vector_double *solution, vector_double *source, level_struct *l, struct Thread *threading ) { + int iter = 0; //, start = threading->start_index[l->depth], end = threading->end_index[l->depth]; vector_double rhs = (g.mixed_precision==2 && g.method >= 0)?g.p_MP.dp.b:g.p.b; vector_double sol = (g.mixed_precision==2 && g.method >= 0)?g.p_MP.dp.x:g.p.x; @@ -78,8 +83,11 @@ int wilson_driver( vector_double solution, vector_double source, level_struct *l for ( int i=0; i<100; i++ ) { double tmp_t = -MPI_Wtime(); #endif - - vector_double_copy( rhs, source, start, end, l ); + vector_double_change_layout( &sol, &sol, _LV_SV_NV, no_threading ); + vector_double_change_layout( &rhs, &rhs, _LV_SV_NV, no_threading ); + + //vector_double_copy( &rhs, source, start, end, l ); + vector_double_copy_new( &rhs, source, l, threading ); if ( g.method == -1 ) { cgn_double( &(g.p), l, threading ); } else if ( g.mixed_precision == 2 ) { @@ -87,7 +95,9 @@ int wilson_driver( vector_double solution, vector_double source, level_struct *l } else { iter = fgmres_double( &(g.p), l, threading ); } - vector_double_copy( solution, sol, start, end, l ); + //vector_double_copy( solution, &sol, start, end, l ); + vector_double_copy_new( solution, &sol, l, threading ); + #ifdef WILSON_BENCHMARK tmp_t += MPI_Wtime(); if ( tmp_t < t_min ) @@ -101,17 +111,21 @@ int wilson_driver( vector_double solution, vector_double source, level_struct *l END_MASTER(threading) #endif + vector_double_change_layout( &sol, &sol, _NV_LV_SV, no_threading ); + 
vector_double_change_layout( &rhs, &rhs, _NV_LV_SV, no_threading ); + return iter; } -void solve( vector_double solution, vector_double source, level_struct *l, struct Thread *threading ) { +void solve( vector_double *solution, vector_double *source, level_struct *l, struct Thread *threading ) { if ( g.vt.evaluation ) { vector_double rhs = g.mixed_precision==2?g.p_MP.dp.b:g.p.b; // this would yield different results if we threaded it, so we don't START_LOCKED_MASTER(threading) - vector_double_define_random( rhs, 0, l->inner_vector_size, l ); + //vector_double_define_random( &rhs, 0, l->inner_vector_size, l ); + vector_double_define_random_new( &rhs, l, threading ); scan_var( &(g.vt), l ); END_LOCKED_MASTER(threading) } else { @@ -122,8 +136,11 @@ void solve( vector_double solution, vector_double source, level_struct *l, struc void solve_driver( level_struct *l, struct Thread *threading ) { - vector_double solution = NULL, source = NULL; - double minus_twisted_bc[4], norm; + vector_double solution, source; + double minus_twisted_bc[4], norm[g.num_rhs_vect]; + + vector_double_init( &solution ); + vector_double_init( &source ); if(g.bc==2) for ( int i=0; i<4; i++ ) @@ -135,55 +152,66 @@ void solve_driver( level_struct *l, struct Thread *threading ) { printf0("inverting doublet operator\n"); } #endif - PUBLIC_MALLOC( solution, complex_double, l->inner_vector_size ); - PUBLIC_MALLOC( source, complex_double, l->inner_vector_size ); + vector_double_alloc( &solution, _INNER, g.num_rhs_vect, l, threading ); + vector_double_alloc( &source, _INNER, g.num_rhs_vect, l, threading ); - rhs_define( source, l, threading ); + rhs_define( &source, l, threading ); + + vector_double_change_layout( &solution, &solution, _LV_SV_NV, no_threading ); + vector_double_change_layout( &source, &source, _LV_SV_NV, no_threading ); if(g.bc==2) - apply_twisted_bc_to_vector_double( source, source, g.twisted_bc, l); - - norm = global_norm_double( source, 0, l->inner_vector_size, l, threading ); - printf0("source vector norm: %le\n",norm); + apply_twisted_bc_to_vector_double_new( &source, &source, g.twisted_bc, l); + global_norm_double_new( norm, &source, l, threading ); + for( int i=0; iinner_vector_size, l, threading ); + printf0("source vector %d norm: %le\n",i,norm[i]); + } #ifdef HAVE_TM1p1 if( g.n_flavours == 1 ) #endif #ifdef HAVE_TM - if ( g.mu + g.mu_odd_shift != 0.0 || g.mu + g.mu_even_shift != 0.0 ) - if(g.downprop) { + if ( g.mu + g.mu_odd_shift != 0.0 || g.mu + g.mu_even_shift != 0.0 ) + if(g.downprop) { - START_MASTER(threading) - printf0("\n\n+--------------------------- up ---------------------------+\n\n"); - END_MASTER(threading) + START_MASTER(threading) + printf0("\n\n+--------------------------- up ---------------------------+\n\n"); + END_MASTER(threading) - solve( solution, source, l, threading ); + solve( &solution, &source, l, threading ); - if(g.bc==2) - apply_twisted_bc_to_vector_double( solution, solution, minus_twisted_bc, l); + if(g.bc==2) + apply_twisted_bc_to_vector_double_new( &solution, &solution, minus_twisted_bc, l); - START_LOCKED_MASTER(threading) - printf0("\n\n+-------------------------- down --------------------------+\n\n"); - g.mu*=-1; - g.mu_odd_shift*=-1; - g.mu_even_shift*=-1; - END_LOCKED_MASTER(threading) + START_LOCKED_MASTER(threading) + printf0("\n\n+-------------------------- down --------------------------+\n\n"); + g.mu*=-1; + g.mu_odd_shift*=-1; + g.mu_even_shift*=-1; + END_LOCKED_MASTER(threading) - tm_term_update( g.mu, l, threading ); - finalize_operator_update( l, 
threading ); - } + tm_term_update( g.mu, l, threading ); + finalize_operator_update( l, threading ); + } #endif - solve( solution, source, l, threading ); + solve( &solution, &source, l, threading ); if(g.bc==2) - apply_twisted_bc_to_vector_double( solution, solution, minus_twisted_bc, l); + apply_twisted_bc_to_vector_double_new( &solution, &solution, minus_twisted_bc, l); + + global_norm_double_new( norm, &solution, l, threading ); + for( int i=0; iinner_vector_size, l, threading ); + printf0("solution vector %d norm: %le\n",i,norm[i]); + } - norm = global_norm_double( solution, 0, l->inner_vector_size, l, threading ); - printf0("solution vector norm: %le\n",norm); + vector_double_change_layout( &solution, &solution, _NV_LV_SV, no_threading ); + vector_double_change_layout( &source, &source, _NV_LV_SV, no_threading ); - PUBLIC_FREE( solution, complex_double, l->inner_vector_size ); - PUBLIC_FREE( source, complex_double, l->inner_vector_size ); + vector_double_free( &solution, l, threading ); + vector_double_free( &source, l, threading ); #ifdef HAVE_TM1p1 if( g.n_flavours == 2 ) diff --git a/src/top_level.h b/src/top_level.h index cc4b029..a281daa 100644 --- a/src/top_level.h +++ b/src/top_level.h @@ -24,9 +24,9 @@ struct Thread; - void rhs_define( vector_double rhs, level_struct *l, struct Thread *threading ); - int wilson_driver( vector_double solution, vector_double source, level_struct *l, struct Thread *threading ); - void solve( vector_double solution, vector_double source, level_struct *l, struct Thread *threading ); + void rhs_define( vector_double *rhs, level_struct *l, struct Thread *threading ); + int wilson_driver( vector_double *solution, vector_double *source, level_struct *l, struct Thread *threading ); + void solve( vector_double *solution, vector_double *source, level_struct *l, struct Thread *threading ); void solve_driver( level_struct *l, struct Thread *threading ); #endif diff --git a/src/var_table.h b/src/var_table.h index abb321c..dbc6b2a 100644 --- a/src/var_table.h +++ b/src/var_table.h @@ -33,18 +33,19 @@ warning0("SCAN_VAR does not support threading, yet.\n"); \ kind *tmp_var = (kind*)(var_pt); \ kind signum = (start_valinner_vector_size ); \ + vector_double_alloc( &v, _INNER, 1, l, no_threading ); \ if (g.mixed_precision==2) fgmres_MP( &(g.p_MP), l, no_threading ); \ else fgmres_double( &(g.p), l, no_threading ); \ - vector_double_copy( v, x, 0, l->inner_vector_size, l ); \ - norm_v = global_norm_double( v, 0, l->inner_vector_size, l, no_threading ); \ + vector_double_copy( &v, &x, 0, l->inner_vector_size, l ); \ + norm_v = global_norm_double( &v, 0, l->inner_vector_size, l, no_threading ); \ } \ \ for ( *tmp_var = (kind)start_val; signum*(*tmp_var) <= signum*((kind)end_val) + EPS_double; \ @@ -68,32 +69,32 @@ } \ printf0("scanning variable \"%s\", value: %lf, run %d of %d\n", name, (double)(*tmp_var), i+1, g.vt.average_over ); \ if ( g.vt.track_error ) { \ - apply_operator_double( b, v, &(g.p), l, no_threading ); \ - vector_double_define( x, 0, 0, l->inner_vector_size, l ); \ + apply_operator_double( &b, &v, &(g.p), l, no_threading ); \ + vector_double_define( &x, 0, 0, l->inner_vector_size, l ); \ if ( g.vt.track_cgn_error ) { \ ASSERT( g.method >=0 && g.p.restart_length >= 4 ); \ - vector_double_define( x, 0, 0, l->inner_vector_size, l ); \ + vector_double_define( &x, 0, 0, l->inner_vector_size, l ); \ cgn_double( &(g.p), l, no_threading ); \ - vector_double_minus( x, x, v, 0, l->inner_vector_size, l ); \ - g.vt.p_end->values[_CGNR_ERR] += ( 
global_norm_double( x, 0, l->inner_vector_size, l, no_threading ) / norm_v ) / ((double)g.vt.average_over); \ + vector_double_minus( &x, &x, &v, 0, l->inner_vector_size, l ); \ + g.vt.p_end->values[_CGNR_ERR] += ( global_norm_double( &x, 0, l->inner_vector_size, l, no_threading ) / norm_v ) / ((double)g.vt.average_over); \ printf0("CGN: error norm: %le\n", g.vt.p_end->values[_CGNR_ERR] ); \ - vector_double_define( x, 0, 0, l->inner_vector_size, l ); \ + vector_double_define( &x, 0, 0, l->inner_vector_size, l ); \ } \ } else {\ - rhs_define( b, l, no_threading );\ + rhs_define( &b, l, no_threading );\ } \ - vector_double_define( x, 0, 0, l->inner_vector_size, l ); \ + vector_double_define( &x, 0, 0, l->inner_vector_size, l ); \ if (g.mixed_precision==2) fgmres_MP( &(g.p_MP), l, no_threading ); \ else fgmres_double( &(g.p), l, no_threading ); \ if ( i == g.vt.average_over-1 ) prof_print( l ); \ if ( g.vt.track_error ) { \ - vector_double_minus( x, x, v, 0, l->inner_vector_size, l ); \ - g.vt.p_end->values[_SLV_ERR] += ( global_norm_double( x, 0, l->inner_vector_size, l, no_threading ) / norm_v ) / ((double)g.vt.average_over); \ + vector_double_minus( &x, &x, &v, 0, l->inner_vector_size, l ); \ + g.vt.p_end->values[_SLV_ERR] += ( global_norm_double( &x, 0, l->inner_vector_size, l, no_threading ) / norm_v ) / ((double)g.vt.average_over); \ } \ } \ } \ if ( g.vt.track_error ) { \ - FREE( v, complex_double, l->inner_vector_size ); \ + vector_double_free( &v, l, no_threading ); \ } \ tt1 = MPI_Wtime(); \ printf0("\n\ntotal time for parameter scan: %d minutes and %d seconds\n", \ diff --git a/src/vcycle_generic.c b/src/vcycle_generic.c index 038a8fa..cfabc5b 100644 --- a/src/vcycle_generic.c +++ b/src/vcycle_generic.c @@ -22,10 +22,10 @@ #include "main.h" #include "vcycle_PRECISION.h" -void smoother_PRECISION( vector_PRECISION phi, vector_PRECISION Dphi, vector_PRECISION eta, +void smoother_PRECISION( vector_PRECISION *phi, vector_PRECISION *Dphi, vector_PRECISION *eta, int n, const int res, level_struct *l, struct Thread *threading ) { - ASSERT( phi != eta ); + ASSERT( phi->vector_buffer != eta->vector_buffer ); START_MASTER(threading); PROF_PRECISION_START( _SM ); @@ -47,10 +47,10 @@ void smoother_PRECISION( vector_PRECISION phi, vector_PRECISION Dphi, vector_PRE if ( g.method == 4 || g.method == 6 ) { if ( g.odd_even ) { if ( res == _RES ) { - apply_operator_PRECISION( l->sp_PRECISION.x, phi, &(l->p_PRECISION), l, threading ); - vector_PRECISION_minus( l->sp_PRECISION.x, eta, l->sp_PRECISION.x, start, end, l ); + apply_operator_PRECISION( &(l->sp_PRECISION.x), phi, &(l->p_PRECISION), l, threading ); + vector_PRECISION_minus( &(l->sp_PRECISION.x), eta, &(l->sp_PRECISION.x), start, end, l ); } - block_to_oddeven_PRECISION( l->sp_PRECISION.b, res==_RES?l->sp_PRECISION.x:eta, l, threading ); + block_to_oddeven_PRECISION( &(l->sp_PRECISION.b), res==_RES?&(l->sp_PRECISION.x):eta, l, threading ); START_LOCKED_MASTER(threading) l->sp_PRECISION.initial_guess_zero = _NO_RES; END_LOCKED_MASTER(threading) @@ -62,21 +62,21 @@ void smoother_PRECISION( vector_PRECISION phi, vector_PRECISION Dphi, vector_PRE else coarse_solve_odd_even_PRECISION( &(l->sp_PRECISION), &(l->oe_op_PRECISION), l, threading ); } if ( res == _NO_RES ) { - oddeven_to_block_PRECISION( phi, l->sp_PRECISION.x, l, threading ); + oddeven_to_block_PRECISION( phi, &(l->sp_PRECISION.x), l, threading ); } else { - oddeven_to_block_PRECISION( l->sp_PRECISION.b, l->sp_PRECISION.x, l, threading ); - vector_PRECISION_plus( phi, phi, 
l->sp_PRECISION.b, start, end, l ); + oddeven_to_block_PRECISION( &(l->sp_PRECISION.b), &(l->sp_PRECISION.x), l, threading ); + vector_PRECISION_plus( phi, phi, &(l->sp_PRECISION.b), start, end, l ); } } else { START_LOCKED_MASTER(threading) - l->sp_PRECISION.x = phi; l->sp_PRECISION.b = eta; + l->sp_PRECISION.x = *phi; l->sp_PRECISION.b = *eta; END_LOCKED_MASTER(threading) fgmres_PRECISION( &(l->sp_PRECISION), l, threading ); } } else if ( g.method == 5 ) { - vector_PRECISION_copy( l->sp_PRECISION.b, eta, start, end, l ); + vector_PRECISION_copy( &(l->sp_PRECISION.b), eta, start, end, l ); bicgstab_PRECISION( &(l->sp_PRECISION), l, threading ); - vector_PRECISION_copy( phi, l->sp_PRECISION.x, start, end, l ); + vector_PRECISION_copy( phi, &(l->sp_PRECISION.x), start, end, l ); } ASSERT( Dphi == NULL ); } @@ -87,19 +87,19 @@ void smoother_PRECISION( vector_PRECISION phi, vector_PRECISION Dphi, vector_PRE } -void vcycle_PRECISION( vector_PRECISION phi, vector_PRECISION Dphi, vector_PRECISION eta, +void vcycle_PRECISION( vector_PRECISION *phi, vector_PRECISION *Dphi, vector_PRECISION *eta, int res, level_struct *l, struct Thread *threading ) { if ( g.interpolation && l->level>0 ) { for ( int i=0; in_cy; i++ ) { if ( i==0 && res == _NO_RES ) { - restrict_PRECISION( l->next_level->p_PRECISION.b, eta, l, threading ); + restrict_PRECISION( &(l->next_level->p_PRECISION.b), eta, l, threading ); } else { int start = threading->start_index[l->depth]; int end = threading->end_index[l->depth]; - apply_operator_PRECISION( l->vbuf_PRECISION[2], phi, &(l->p_PRECISION), l, threading ); - vector_PRECISION_minus( l->vbuf_PRECISION[3], eta, l->vbuf_PRECISION[2], start, end, l ); - restrict_PRECISION( l->next_level->p_PRECISION.b, l->vbuf_PRECISION[3], l, threading ); + apply_operator_PRECISION( &(l->vbuf_PRECISION[2]), phi, &(l->p_PRECISION), l, threading ); + vector_PRECISION_minus( &(l->vbuf_PRECISION[3]), eta, &(l->vbuf_PRECISION[2]), start, end, l ); + restrict_PRECISION( &(l->next_level->p_PRECISION.b), &(l->vbuf_PRECISION[3]), l, threading ); } if ( !l->next_level->idle ) { START_MASTER(threading) @@ -110,7 +110,7 @@ void vcycle_PRECISION( vector_PRECISION phi, vector_PRECISION Dphi, vector_PRECI if ( g.kcycle ) fgmres_PRECISION( &(l->next_level->p_PRECISION), l->next_level, threading ); else - vcycle_PRECISION( l->next_level->p_PRECISION.x, NULL, l->next_level->p_PRECISION.b, _NO_RES, l->next_level, threading ); + vcycle_PRECISION( &(l->next_level->p_PRECISION.x), NULL, &(l->next_level->p_PRECISION.b), _NO_RES, l->next_level, threading ); } else { if ( g.odd_even ) { if ( g.method == 6 ) { @@ -128,9 +128,9 @@ void vcycle_PRECISION( vector_PRECISION phi, vector_PRECISION Dphi, vector_PRECI END_MASTER(threading) } if( i == 0 && res == _NO_RES ) - interpolate3_PRECISION( phi, l->next_level->p_PRECISION.x, l, threading ); + interpolate3_PRECISION( phi, &(l->next_level->p_PRECISION.x), l, threading ); else - interpolate_PRECISION( phi, l->next_level->p_PRECISION.x, l, threading ); + interpolate_PRECISION( phi, &(l->next_level->p_PRECISION.x), l, threading ); smoother_PRECISION( phi, Dphi, eta, l->post_smooth_iter, _RES, l, threading ); res = _RES; } diff --git a/src/vcycle_generic.h b/src/vcycle_generic.h index 5e54a74..8c251f6 100644 --- a/src/vcycle_generic.h +++ b/src/vcycle_generic.h @@ -32,10 +32,10 @@ #include "threading.h" #include "solver_analysis.h" - void smoother_PRECISION( vector_PRECISION phi, vector_PRECISION Dphi, vector_PRECISION eta, + void smoother_PRECISION( vector_PRECISION *phi, 
                               vector_PRECISION *Dphi, vector_PRECISION *eta, int n, const int res, level_struct *l, struct Thread *threading );
-  void vcycle_PRECISION( vector_PRECISION phi, vector_PRECISION Dphi, vector_PRECISION eta,
+  void vcycle_PRECISION( vector_PRECISION *phi, vector_PRECISION *Dphi, vector_PRECISION *eta,
                          int res, level_struct *l, struct Thread *threading );
 #endif
diff --git a/src/vector_generic.c b/src/vector_generic.c
new file mode 100644
index 0000000..85276c1
--- /dev/null
+++ b/src/vector_generic.c
@@ -0,0 +1,301 @@
+/*
+ * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder.
+ *
+ * This file is part of the DDalphaAMG solver library.
+ *
+ * The DDalphaAMG solver library is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * The DDalphaAMG solver library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with the DDalphaAMG solver library. If not, see http://www.gnu.org/licenses/.
+ *
+ */
+
+#include "main.h"
+
+void vector_PRECISION_init( vector_PRECISION *vec ) {
+
+  vec->vector_buffer = NULL;
+}
+
+
+void vector_PRECISION_alloc( vector_PRECISION *vec, const int type, int num_vect, level_struct *l, Thread *threading ) {
+
+  switch (type){
+    case _ORDINARY :
+      PUBLIC_MALLOC( vec->vector_buffer, complex_PRECISION, l->vector_size*num_vect );
+      vec->size = l->vector_size;
+      break;
+    case _SCHWARZ :
+      PUBLIC_MALLOC( vec->vector_buffer, complex_PRECISION, l->schwarz_vector_size*num_vect );
+      vec->size = l->schwarz_vector_size;
+      break;
+    case _INNER:
+      PUBLIC_MALLOC( vec->vector_buffer, complex_PRECISION, l->inner_vector_size*num_vect );
+      vec->size = l->inner_vector_size;
+      break;
+  }
+
+  vec->type = type;
+  vec->num_vect = num_vect;
+  vec->layout = _NV_LV_SV;
+  vec->l = l;
+}
+
+
+void vector_PRECISION_free( vector_PRECISION *vec, level_struct *l, struct Thread *threading ) {
+
+  switch (vec->type){
+    case _ORDINARY : PUBLIC_FREE( vec->vector_buffer, complex_PRECISION, l->vector_size*vec->num_vect );
+      break;
+    case _SCHWARZ : PUBLIC_FREE( vec->vector_buffer, complex_PRECISION, l->schwarz_vector_size*vec->num_vect );
+      break;
+    case _INNER : PUBLIC_FREE( vec->vector_buffer, complex_PRECISION, l->inner_vector_size*vec->num_vect );
+      break;
+  }
+}
+
+
+// vector storage for PRECISION precision
+void vector_PRECISION_define( vector_PRECISION *phi, complex_PRECISION value, int start, int end, level_struct *l ) {
+
+  int thread = omp_get_thread_num();
+  if(thread == 0 && start != end)
+    PROF_PRECISION_START( _SET );
+  if ( phi->vector_buffer != NULL ) {
+    int i;
+    for ( i=start; i<end; i++ )
+      phi->vector_buffer[i] = value;
+  } else {
+    error0("Error in \"vector_PRECISION_define\": pointer is null\n");
+  }
+  if(thread == 0 && start != end)
+    PROF_PRECISION_STOP( _SET, 1 );
+}
+
+
+void vector_PRECISION_define_new( vector_PRECISION *phi, complex_PRECISION value, level_struct *l, struct Thread *threading ) {
+
+  int start, end;
+  compute_core_start_end(0, (phi->size)*(phi->num_vect), &start, &end, l, threading);
+  int thread = omp_get_thread_num();
+  if(thread == 0)
+    PROF_PRECISION_START( _SET );
+
+  if ( phi->vector_buffer != NULL ) {
+    int i;
+    for ( i=start; i<end; i++ )
+      phi->vector_buffer[i] = value;
+  } else {
+    error0("Error in \"vector_PRECISION_define_new\": pointer is null\n");
+  }
+  if(thread == 0)
+    PROF_PRECISION_STOP( _SET, 1 );
+}
+
+
+void vector_PRECISION_real_scale( vector_PRECISION *z, vector_PRECISION *x, complex_PRECISION alpha,
+                                  int start, int end, level_struct *l ) {
+
+  vector_PRECISION_check_comp( z, x );
+  //z->layout = x->layout;
+
+  int thread = omp_get_thread_num();
+  if(thread == 0 && start != end)
+    PROF_PRECISION_START( _RS );
+
+  PRECISION *r_z = (PRECISION*)z->vector_buffer, *r_x = (PRECISION*)x->vector_buffer, r_alpha = creal_PRECISION(alpha);
+  int r_start = 2*start, r_end = 2*end;
+
+  REAL_VECTOR_FOR( int i=r_start, i<r_end, r_z[i] = r_alpha*r_x[i], i++, l );
+
+  if(thread == 0 && start != end)
+    PROF_PRECISION_STOP( _RS, (double)(end-start)/(double)l->inner_vector_size );
+}
+
+
+/*
+ * opt = 0 : z = alpha*x
+ * opt = 1 : z = (1/alpha)*x
+ */
+void vector_PRECISION_real_scale_new( vector_PRECISION *z, vector_PRECISION *x, complex_PRECISION *alpha,
+                                      int n, int opt, level_struct *l, struct Thread *threading ) {
+
+  //vector_PRECISION_check_comp( z, x );
+
+  int i, j, jj, start, end;
+  PRECISION r_alpha[x->num_vect];
+
+  if(opt){
+    VECTOR_LOOP(j, x->num_vect, jj, r_alpha[j+jj]=1.0/creal_PRECISION(alpha[n*x->num_vect+j+jj]);)
+  }else{
+    VECTOR_LOOP(j, x->num_vect, jj, r_alpha[j+jj]=creal_PRECISION(alpha[n*x->num_vect+j+jj]);)
+  }
+
+  compute_core_start_end(0, x->size, &start, &end, l, threading);
+  int thread = omp_get_thread_num();
+  if(thread == 0 && start != end)
+    PROF_PRECISION_START( _RS );
+
+  //vector_PRECISION_change_layout( x, x, _LV_SV_NV, no_threading );
+  //vector_PRECISION_change_layout( z, z, _LV_SV_NV, no_threading );
+  if(z == x){
+    for( i=start; i<end; i++)
+      VECTOR_LOOP(j, x->num_vect, jj, z->vector_buffer[i*x->num_vect+j+jj] *= r_alpha[j+jj];)
+  } else {
+    for( i=start; i<end; i++)
+      VECTOR_LOOP(j, x->num_vect, jj, z->vector_buffer[i*x->num_vect+j+jj] = r_alpha[j+jj]*x->vector_buffer[i*x->num_vect+j+jj];)
+  }
+  //vector_PRECISION_change_layout( x, x, _NV_LV_SV, no_threading );
+  //vector_PRECISION_change_layout( z, z, _NV_LV_SV, no_threading );
+
+  if(thread == 0 && start != end)
+    PROF_PRECISION_STOP( _RS, (double)(end-start)/(double)l->inner_vector_size );
+}
+
+
+
+void vector_PRECISION_copy( vector_PRECISION *z, vector_PRECISION *x, int start, int end, level_struct *l ) {
+
+  if(z == x) return;
+
+  buffer_PRECISION z_pt=z->vector_buffer, x_pt=x->vector_buffer;
+  int thread = omp_get_thread_num();
+  if(thread == 0 && start != end)
+    PROF_PRECISION_START( _CPY );
+  VECTOR_FOR( int i=start, i<end, z_pt[i] = x_pt[i], i++, l );
+  if(thread == 0 && start != end)
+    PROF_PRECISION_STOP( _CPY, (double)(end-start)/(double)l->inner_vector_size );
+}
+
+
+void vector_PRECISION_copy_new( vector_PRECISION *z, vector_PRECISION *x, level_struct *l, struct Thread *threading ) {
+
+  if(z == x) return;
+
+  int i, j, jj, start, end;
+  compute_core_start_end(0, x->size, &start, &end, l, threading);
+  int thread = omp_get_thread_num();
+  if(thread == 0)
+    PROF_PRECISION_START( _CPY );
+
+  for( i=start; i<end; i++)
+    VECTOR_LOOP(j, x->num_vect, jj, z->vector_buffer[i*x->num_vect+j+jj] = x->vector_buffer[i*x->num_vect+j+jj];)
+
+  //vector_PRECISION_change_layout( x, x, _NV_LV_SV, no_threading );
+  //vector_PRECISION_change_layout( z, z, _NV_LV_SV, no_threading );
+
+  if(thread == 0)
+    PROF_PRECISION_STOP( _CPY, (double)(end-start)/(double)l->inner_vector_size );
+}
+
+
+void vector_PRECISION_check_comp( vector_PRECISION *vec1, vector_PRECISION *vec2) {
+
+  if(vec1->num_vect != vec2->num_vect)
+    error0("Error: The number of vectors has to be the same in both vectors\n");
+
+  if(vec1->l->level != vec2->l->level)
+    error0("Error: The level of multigrid must be the same in both vectors\n");
+
+  if(vec1->type != vec2->type)
+    error0("Error: The type must be the same in both vectors\n");
+
+}
+
+
+void vector_PRECISION_change_layout( vector_PRECISION *vec_out, vector_PRECISION *vec_in, const int layout, struct Thread *threading ) {
+
+  if(vec_in->layout==layout) return;
+
+  vector_PRECISION_check_comp( vec_out, vec_in );
+
+  int n, i, sv, lv = 0, num_sv = vec_in->l->num_lattice_site_var;
+  vector_PRECISION vec_tmp;
+
+  if( vec_in->vector_buffer == vec_out->vector_buffer ){
+    vector_PRECISION_init( &vec_tmp );
+    vector_PRECISION_alloc( &vec_tmp, vec_in->type, vec_in->num_vect, vec_in->l, no_threading );
+  } else {
+    vec_tmp = *vec_out;
+  }
+
+  switch (vec_in->type){
+    case _ORDINARY :
+      lv = vec_in->l->num_lattice_sites;
+      break;
+    case _SCHWARZ :
+      lv = 2*vec_in->l->num_lattice_sites - vec_in->l->num_inner_lattice_sites;
+      break;
+    case _INNER:
+      lv = vec_in->l->num_inner_lattice_sites;
+      break;
+  }
+
+  switch (layout){
+    case _NV_LV_SV :
+      for( n=0; n<vec_in->num_vect; n++ )
+        for( i=0; i<lv; i++ )
+          for( sv=0; sv<num_sv; sv++ )
+            vec_tmp.vector_buffer[INDEX_NV_LV_SV( n, vec_in->num_vect, i, lv, sv, num_sv )] = vec_in->vector_buffer[INDEX_LV_SV_NV( n, vec_in->num_vect, i, lv, sv, num_sv )];
+
+      vec_out->layout = _NV_LV_SV;
+      break;
+    case _LV_SV_NV :
+      for( i=0; i<lv; i++ )
+        for( sv=0; sv<num_sv; sv++ )
+          for( n=0; n<vec_in->num_vect; n++ )
+            vec_tmp.vector_buffer[INDEX_LV_SV_NV( n, vec_in->num_vect, i, lv, sv, num_sv )] = vec_in->vector_buffer[INDEX_NV_LV_SV( n, vec_in->num_vect, i, lv, sv, num_sv )];
+
+      vec_out->layout = _LV_SV_NV;
+      break;
+  }
+
+  if( vec_in->vector_buffer == vec_out->vector_buffer ){
+    vector_PRECISION_copy( vec_out, &vec_tmp, 0, lv*num_sv*vec_out->num_vect, vec_out->l );
+    vector_PRECISION_free( &vec_tmp, vec_in->l, no_threading );
+  }
+
+}
+
+void vector_PRECISION_test_routine( level_struct *l, struct Thread *threading ) {
+
+  PRECISION diff = 0;
+
+  vector_PRECISION vp[3];
+
+  for(int i=0; i<3; i++){
+    vector_PRECISION_init( &vp[i] );
+    vector_PRECISION_alloc( &vp[i], _ORDINARY, 4, l, threading );
+  }
+
+  START_LOCKED_MASTER(threading)
+
+  vector_PRECISION_define_random( &vp[0], 0, 4*l->vector_size, l );
+  vector_PRECISION_copy( &vp[1], &vp[0], 0, 4*l->vector_size, l );
+  vector_PRECISION_change_layout( &vp[1], &vp[1], _LV_SV_NV, no_threading );
+  vector_PRECISION_change_layout( &vp[1], &vp[1], _NV_LV_SV, no_threading );
+  vector_PRECISION_minus( &vp[2], &vp[1], &vp[0], 0, 4*l->vector_size, l );
+  diff = global_norm_PRECISION( &vp[2], 0, 4*l->vector_size, l, no_threading )/
+         global_norm_PRECISION( &vp[0], 0, 4*l->vector_size, l, no_threading );
+
+  test0_PRECISION("depth: %d, correctness of vector PRECISION change layout: %le\n", l->depth, diff );
+
+  END_LOCKED_MASTER(threading)
+  for(int i=0; i<3; i++){
+    vector_PRECISION_free( &vp[i], l, threading );
+  }
+  if ( l->level == 0 && g.method == 0)
+    return;
+  else
+    vector_PRECISION_test_routine(l->next_level, threading);
+}
diff --git a/src/vector_generic.h b/src/vector_generic.h
new file mode 100644
index 0000000..901e4a2
--- /dev/null
+++ b/src/vector_generic.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder.
+ *
+ * This file is part of the DDalphaAMG solver library.
+ *
+ * The DDalphaAMG solver library is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * + * The DDalphaAMG solver library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * + * You should have received a copy of the GNU General Public License + * along with the DDalphaAMG solver library. If not, see http://www.gnu.org/licenses/. + * + */ + +#ifndef VECTOR_PRECISION_HEADER + #define VECTOR_PRECISION_HEADER + + struct Thread; + + void vector_PRECISION_init( vector_PRECISION *vec ); + void vector_PRECISION_alloc( vector_PRECISION *vec, const int type, int num_vect, level_struct *l, struct Thread *threading ); + void vector_PRECISION_free( vector_PRECISION *vec, level_struct *l, Thread *threading); + void vector_PRECISION_define( vector_PRECISION *phi, complex_PRECISION value, int start, int end, level_struct *l ); + void vector_PRECISION_define_new( vector_PRECISION *phi, complex_PRECISION value, level_struct *l, struct Thread *threading ); + void vector_PRECISION_real_scale( vector_PRECISION *z, vector_PRECISION *x, complex_PRECISION alpha, + int start, int end, level_struct *l ); + void vector_PRECISION_real_scale_new( vector_PRECISION *z, vector_PRECISION *x, complex_PRECISION *alpha, + int n, int opt, level_struct *l, struct Thread *threading ); + void vector_PRECISION_copy( vector_PRECISION *z, vector_PRECISION *x, int start, int end, level_struct *l ); // z := x + void vector_PRECISION_copy_new( vector_PRECISION *z, vector_PRECISION *x, level_struct *l, struct Thread *threading ); + void vector_PRECISION_check_comp( vector_PRECISION *vec1, vector_PRECISION *vec2 ); + void vector_PRECISION_change_layout( vector_PRECISION *vec_out, vector_PRECISION *vec_in, const int layout, struct Thread *threading ); + void vector_PRECISION_test_routine( level_struct *l, struct Thread *threading ); + +#endif diff --git a/src/vectorization_control.h b/src/vectorization_control.h deleted file mode 100644 index f05a701..0000000 --- a/src/vectorization_control.h +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder. - * - * This file is part of the DDalphaAMG solver library. - * - * The DDalphaAMG solver library is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * The DDalphaAMG solver library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * - * You should have received a copy of the GNU General Public License - * along with the DDalphaAMG solver library. If not, see http://www.gnu.org/licenses/. 
- * - */ - -#ifndef VECTORIZATION_CONTROL_H -#define VECTORIZATION_CONTROL_H - -#ifdef SSE - -#define SIMD_LENGTH_float 4 -#define SIMD_LENGTH_double 2 - -#define OPTIMIZED_COARSE_NEIGHBOR_COUPLING_float -#define OPTIMIZED_COARSE_SELF_COUPLING_float -#define INTERPOLATION_OPERATOR_LAYOUT_OPTIMIZED_float -#define INTERPOLATION_SETUP_LAYOUT_OPTIMIZED_float -#define OPTIMIZED_NEIGHBOR_COUPLING_double -#define OPTIMIZED_NEIGHBOR_COUPLING_float -#define OPTIMIZED_SELF_COUPLING_float -#define GRAM_SCHMIDT_VECTORIZED_float -#define OPTIMIZED_LINALG_float -#define OPTIMIZED_LINALG_double - -#include "sse_complex_float_intrinsic.h" -#include "sse_complex_double_intrinsic.h" - -#endif - -#define OPERATOR_COMPONENT_OFFSET_float (SIMD_LENGTH_float *((l->num_eig_vect+SIMD_LENGTH_float -1)/SIMD_LENGTH_float )) -#define OPERATOR_COMPONENT_OFFSET_double (SIMD_LENGTH_double*((l->num_eig_vect+SIMD_LENGTH_double-1)/SIMD_LENGTH_double)) - -#define OPERATOR_TYPE_float float -#define OPERATOR_TYPE_double double - -#endif // VECTORIZATION_CONTROL_H diff --git a/src/vectorization_dirac_generic.c b/src/vectorization_dirac_generic.c deleted file mode 100644 index 9ea2b3e..0000000 --- a/src/vectorization_dirac_generic.c +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder. - * - * This file is part of the DDalphaAMG solver library. - * - * The DDalphaAMG solver library is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * The DDalphaAMG solver library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * - * You should have received a copy of the GNU General Public License - * along with the DDalphaAMG solver library. If not, see http://www.gnu.org/licenses/. 
- * - */ - -#include "main.h" - -#ifdef SSE -void d_plus_clover_aggregate_PRECISION_vectorized( complex_PRECISION *eta1, complex_PRECISION *eta2, - complex_PRECISION *phi, schwarz_PRECISION_struct *s, level_struct *l, - int site, int *direction_flags ) { - - int offset = SIMD_LENGTH_PRECISION; - int site_offset = 12*offset; - int index_out; - int index_bw; - int index_fw; - int *neighbor = s->op.neighbor_table; - int *backward_neighbor = s->op.backward_neighbor_table; - complex_PRECISION *phi_pt; - complex_PRECISION buffer1[site_offset] __attribute__((aligned(64))); - complex_PRECISION buffer2[site_offset] __attribute__((aligned(64))); - config_PRECISION D_pt; - config_PRECISION D = s->op.D; - - // add clover term/shift - spin0and1_site_clover_PRECISION_vectorized( eta1, phi+site_offset*site, s->op.clover+42*site, 4+s->op.m0, offset ); - spin2and3_site_clover_PRECISION_vectorized( eta2, phi+site_offset*site, s->op.clover+42*site, 4+s->op.m0, offset ); - - index_out = site; - - for(int mu=0; mu<4; mu++) { - index_fw = neighbor[4*index_out + mu]; - index_bw = backward_neighbor[4*index_out + mu]; - - // from backward - if ( direction_flags[2*mu+0] == 1 ) { - D_pt = D + 36*index_bw+9*mu; - phi_pt = phi + site_offset*index_bw; - mvmh_PRECISION_vectorized( buffer2+0*offset, D_pt, phi_pt+0*offset, offset ); - mvmh_PRECISION_vectorized( buffer2+3*offset, D_pt, phi_pt+3*offset, offset ); - mvmh_PRECISION_vectorized( buffer2+6*offset, D_pt, phi_pt+6*offset, offset ); - mvmh_PRECISION_vectorized( buffer2+9*offset, D_pt, phi_pt+9*offset, offset ); - twospin_PRECISION_vectorized( eta1, eta2, buffer2, offset, mu, -1.0 ); - } - - // from forward - if ( direction_flags[2*mu+1] == 1 ) { - D_pt = D + 36*index_out+9*mu; - phi_pt = phi + site_offset*index_fw; - mvm_PRECISION_vectorized( buffer1+0*offset, D_pt, phi_pt+0*offset, offset ); - mvm_PRECISION_vectorized( buffer1+3*offset, D_pt, phi_pt+3*offset, offset ); - mvm_PRECISION_vectorized( buffer1+6*offset, D_pt, phi_pt+6*offset, offset ); - mvm_PRECISION_vectorized( buffer1+9*offset, D_pt, phi_pt+9*offset, offset ); - twospin_PRECISION_vectorized( eta1, eta2, buffer1, offset, mu, 1.0 ); - } - } -} -#endif - -#ifdef SSE -void d_neighbor_aggregate_PRECISION_vectorized( complex_PRECISION *eta1, complex_PRECISION *eta2, - complex_PRECISION *phi, const int mu, schwarz_PRECISION_struct *s, level_struct *l, - int site ) { - - int offset = SIMD_LENGTH_PRECISION; - int site_offset = 12*offset; - int index_out; - int index_fw; - int *neighbor = s->op.neighbor_table; - complex_PRECISION *phi_pt; - complex_PRECISION buffer[site_offset] __attribute__((aligned(64))); - config_PRECISION D_pt; - config_PRECISION D = s->op.D; - - index_out = site; - - // requires the positive boundaries of phi to be communicated befor - index_fw = neighbor[4*index_out + mu]; - D_pt = D + 36*index_out+9*mu; - phi_pt = phi + site_offset*index_fw; - mvm_PRECISION_vectorized_simd_length( buffer+0*offset, D_pt, phi_pt+0*offset ); - mvm_PRECISION_vectorized_simd_length( buffer+3*offset, D_pt, phi_pt+3*offset ); - mvm_PRECISION_vectorized_simd_length( buffer+6*offset, D_pt, phi_pt+6*offset ); - mvm_PRECISION_vectorized_simd_length( buffer+9*offset, D_pt, phi_pt+9*offset ); - twospin2_p_PRECISION_vectorized_simd_length( eta1, eta2, buffer, mu ); -} -#endif - -#ifdef SSE -void diagonal_aggregate_PRECISION_vectorized( complex_PRECISION *eta1, complex_PRECISION *eta2, - complex_PRECISION *phi, schwarz_PRECISION_struct *s, level_struct *l, - int site ) { - - int offset = SIMD_LENGTH_PRECISION; - int 
site_offset = 12*offset; - - sse_diagonal_aggregate_PRECISION( eta1, eta2, phi+site_offset*site, s->op.odd_proj+12*site, offset ); -} -#endif diff --git a/src/vectorization_dirac_generic.h b/src/vectorization_dirac_generic.h deleted file mode 100644 index 5b8f02c..0000000 --- a/src/vectorization_dirac_generic.h +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder. - * - * This file is part of the DDalphaAMG solver library. - * - * The DDalphaAMG solver library is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * The DDalphaAMG solver library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * - * You should have received a copy of the GNU General Public License - * along with the DDalphaAMG solver library. If not, see http://www.gnu.org/licenses/. - * - */ - -#ifndef VECTORIZATION_DIRAC_PRECISION_HEADER - #define VECTORIZATION_DIRAC_PRECISION_HEADER - -#ifdef SSE - #include "sse_dirac.h" -#endif - - // caller is responsibel for checking that he needs coupling in this direction for this site - void d_neighbor_aggregate_PRECISION_vectorized( complex_PRECISION *eta1, complex_PRECISION *eta2, - complex_PRECISION *phi, const int mu, schwarz_PRECISION_struct *s, level_struct *l, - int site ); - - void d_plus_clover_aggregate_PRECISION_vectorized( complex_PRECISION *eta1, complex_PRECISION *eta2, - complex_PRECISION *phi, schwarz_PRECISION_struct *s, level_struct *l, - int site, int *direction_flags ); - - void diagonal_aggregate_PRECISION_vectorized( complex_PRECISION *eta1, complex_PRECISION *eta2, - complex_PRECISION *phi, schwarz_PRECISION_struct *s, - level_struct *l, int site ); - - // spinors are vectorized, gauge is same for all (use for multiple rhs) - static inline void mvm_PRECISION_vectorized_simd_length( - const complex_PRECISION *eta, const complex_PRECISION *D, const complex_PRECISION *phi ) { -#ifdef SSE - sse_mvm_PRECISION_simd_length( eta, D, phi ); -#endif - - } - // spinors are vectorized, gauge is same for all (use for multiple rhs) - static inline void mvm_PRECISION_vectorized( - const complex_PRECISION *eta, const complex_PRECISION *D, const complex_PRECISION *phi, int elements ) { -#ifdef SSE - sse_mvm_PRECISION( eta, D, phi, elements ); -#endif - } - - // spinors are vectorized, gauge is same for all (use for multiple rhs) - static inline void mvmh_PRECISION_vectorized( - const complex_PRECISION *eta, const complex_PRECISION *D, const complex_PRECISION *phi, int elements ) { -#ifdef SSE - sse_mvmh_PRECISION( eta, D, phi, elements ); -#endif - } - - // mu is according to the enum for T,Z,Y,X defined in clifford.h - static inline void twospin_PRECISION_vectorized( complex_PRECISION *out_spin0and1, complex_PRECISION *out_spin2and3, const complex_PRECISION *in, int elements, int mu, double sign ) { -#ifdef SSE - sse_twospin_PRECISION( out_spin0and1, out_spin2and3, in, elements, mu, sign ); -#endif - } - static inline void twospin_p_T_PRECISION_vectorized( complex_PRECISION *out_spin0and1, complex_PRECISION *out_spin2and3, const complex_PRECISION *in, int elements ) { - twospin_PRECISION_vectorized( out_spin0and1, 
out_spin2and3, in, elements, T, 1.0); - } - static inline void twospin_n_T_PRECISION_vectorized( complex_PRECISION *out_spin0and1, complex_PRECISION *out_spin2and3, const complex_PRECISION *in, int elements ) { - twospin_PRECISION_vectorized( out_spin0and1, out_spin2and3, in, elements, T, -1.0); - } - static inline void twospin_p_Z_PRECISION_vectorized( complex_PRECISION *out_spin0and1, complex_PRECISION *out_spin2and3, const complex_PRECISION *in, int elements ) { - twospin_PRECISION_vectorized( out_spin0and1, out_spin2and3, in, elements, Z, 1.0); - } - static inline void twospin_n_Z_PRECISION_vectorized( complex_PRECISION *out_spin0and1, complex_PRECISION *out_spin2and3, const complex_PRECISION *in, int elements ) { - twospin_PRECISION_vectorized( out_spin0and1, out_spin2and3, in, elements, Z, -1.0); - } - static inline void twospin_p_Y_PRECISION_vectorized( complex_PRECISION *out_spin0and1, complex_PRECISION *out_spin2and3, const complex_PRECISION *in, int elements ) { - twospin_PRECISION_vectorized( out_spin0and1, out_spin2and3, in, elements, Y, 1.0); - } - static inline void twospin_n_Y_PRECISION_vectorized( complex_PRECISION *out_spin0and1, complex_PRECISION *out_spin2and3, const complex_PRECISION *in, int elements ) { - twospin_PRECISION_vectorized( out_spin0and1, out_spin2and3, in, elements, Y, -1.0); - } - static inline void twospin_p_X_PRECISION_vectorized( complex_PRECISION *out_spin0and1, complex_PRECISION *out_spin2and3, const complex_PRECISION *in, int elements ) { - twospin_PRECISION_vectorized( out_spin0and1, out_spin2and3, in, elements, X, 1.0); - } - static inline void twospin_n_X_PRECISION_vectorized( complex_PRECISION *out_spin0and1, complex_PRECISION *out_spin2and3, const complex_PRECISION *in, int elements ) { - twospin_PRECISION_vectorized( out_spin0and1, out_spin2and3, in, elements, X, -1.0); - } - - // mu is according to the enum for T,Z,Y,X defined in clifford.h - static inline void twospin2_p_PRECISION_vectorized_simd_length( complex_PRECISION *out_spin0and1, complex_PRECISION *out_spin2and3, const complex_PRECISION *in, int mu ) { -#ifdef SSE - sse_twospin2_p_PRECISION_simd_length( out_spin0and1, out_spin2and3, in, mu ); -#endif - } - // mu is according to the enum for T,Z,Y,X defined in clifford.h - static inline void twospin2_p_PRECISION_vectorized( complex_PRECISION *out_spin0and1, complex_PRECISION *out_spin2and3, const complex_PRECISION *in, int elements, int mu ) { -#ifdef SSE - sse_twospin2_p_PRECISION( out_spin0and1, out_spin2and3, in, elements, mu ); -#endif - } - static inline void twospin2_p_T_PRECISION_vectorized( complex_PRECISION *out_spin0and1, complex_PRECISION *out_spin2and3, const complex_PRECISION *in, int elements ) { - twospin2_p_PRECISION_vectorized( out_spin0and1, out_spin2and3, in, elements, T); - } - static inline void twospin2_p_Z_PRECISION_vectorized( complex_PRECISION *out_spin0and1, complex_PRECISION *out_spin2and3, const complex_PRECISION *in, int elements ) { - twospin2_p_PRECISION_vectorized( out_spin0and1, out_spin2and3, in, elements, Z); - } - static inline void twospin2_p_Y_PRECISION_vectorized( complex_PRECISION *out_spin0and1, complex_PRECISION *out_spin2and3, const complex_PRECISION *in, int elements ) { - twospin2_p_PRECISION_vectorized( out_spin0and1, out_spin2and3, in, elements, Y); - } - static inline void twospin2_p_X_PRECISION_vectorized( complex_PRECISION *out_spin0and1, complex_PRECISION *out_spin2and3, const complex_PRECISION *in, int elements ) { - twospin2_p_PRECISION_vectorized( out_spin0and1, out_spin2and3, in, 
elements, X); - } - - static inline void spin0and1_site_clover_PRECISION_vectorized( const complex_PRECISION *eta, const complex_PRECISION *phi, - const config_PRECISION clover, double shift, int elements ) { -#ifdef SSE - sse_spin0and1_site_clover_PRECISION( eta, phi, clover, shift, elements ); -#endif - } - - static inline void spin2and3_site_clover_PRECISION_vectorized( const complex_PRECISION *eta, const complex_PRECISION *phi, - const config_PRECISION clover, double shift, int elements ) { -#ifdef SSE - sse_spin2and3_site_clover_PRECISION( eta, phi, clover, shift, elements ); -#endif - } - -#endif
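
For orientation, the following is a minimal usage sketch of the vector container introduced above in vector_generic.c / vector_generic.h, written against the double instantiation of the PRECISION generics. It only strings together calls whose signatures appear in the new header; the level_struct and Thread objects, the sketch function name, and the choice of two stacked vectors are illustrative assumptions, not part of the patch.

// Sketch only: assumes the double instantiation of the PRECISION generics and
// that l and threading come from the usual DDalphaAMG setup.
void vector_usage_sketch( level_struct *l, struct Thread *threading ) {

  vector_double v, w;

  vector_double_init( &v );                              // sets vector_buffer = NULL
  vector_double_init( &w );
  vector_double_alloc( &v, _INNER, 2, l, threading );    // inner-sized buffer holding 2 stacked vectors
  vector_double_alloc( &w, _INNER, 2, l, threading );    // allocation also sets the default layout _NV_LV_SV

  vector_double_define_new( &v, 1.0, l, threading );     // every entry of both stacked vectors := 1
  vector_double_copy_new( &w, &v, l, threading );        // w := v

  // reorder w in place from the allocation default _NV_LV_SV to _LV_SV_NV;
  // vector_double_change_layout allocates a scratch copy when input and output share a buffer
  vector_double_change_layout( &w, &w, _LV_SV_NV, no_threading );

  vector_double_free( &v, l, threading );
  vector_double_free( &w, l, threading );
}

The wrapper keeps the raw buffer together with its size, the number of stacked vectors and their layout, which is what lets call sites such as vcycle_PRECISION above take vector_PRECISION* arguments instead of bare buffers.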