diff --git a/build/.gitignore b/build/.gitignore index 5761abc..35d987b 100644 --- a/build/.gitignore +++ b/build/.gitignore @@ -1 +1,4 @@ +* *.o +!gsrc +!.gitignore \ No newline at end of file diff --git a/src/DDalphaAMG_interface.c b/src/DDalphaAMG_interface.c index 2bc3f82..da3b343 100644 --- a/src/DDalphaAMG_interface.c +++ b/src/DDalphaAMG_interface.c @@ -570,7 +570,7 @@ void DDalphaAMG_update_setup( int iterations, DDalphaAMG_status * mg_status ) { } } -static inline void vector_copy( vector_double vector_out, vector_double vector_in ) +static inline void vector_copy( vector_double *vector_out, vector_double *vector_in ) { THREADED(threading[0]->n_core) { int start = threading[omp_get_thread_num()]->start_index[0], @@ -591,7 +591,7 @@ static inline void solver( ) } } -static inline void correct_guess( vector_double guess, vector_double solution, vector_double solution2, +static inline void correct_guess( vector_double *guess, vector_double *solution, vector_double *solution2, double even_dshift, double odd_dshift ) { // guess = D^{-1}*rhs - i*dshift*D^{-2}*rhs @@ -666,8 +666,8 @@ static inline void DDalphaAMG_driver( double *vector1_out, double *vector1_in, d complex_double twisted_bc, tmp1, tmp2; double phase[4] = {_COMPLEX_double_ZERO, _COMPLEX_double_ZERO, _COMPLEX_double_ZERO, _COMPLEX_double_ZERO}, vmin=1, vmax=EPS_float, vtmp, nrhs, nrhs2; gmres_double_struct *p = g.mixed_precision==2?&(g.p_MP.dp):&(g.p); - vector_double vb=p->b, rhs = p->b; - vector_double vx=p->x, sol = p->x; + buffer_double vb=p->b.vector_buffer, vx=p->x.vector_buffer; + vector_double *rhs = &(p->b), *sol = &(p->x); DDalphaAMG_status tmp_status; double t0, t1; @@ -717,40 +717,41 @@ static inline void DDalphaAMG_driver( double *vector1_out, double *vector1_in, d for ( mu=0; mu<4; mu++ ) { for ( k=0; k<3; k++, j++ ) { #ifndef BASIS4 - rhs[j] = ((complex_double)vector1_in[i+2*(k+3*mu)] + I*(complex_double)vector1_in[i+2*(k+3*mu)+1]) * twisted_bc; - rhs[j+6] = ((complex_double)vector2_in[i+2*(k+3*mu)] + I*(complex_double)vector2_in[i+2*(k+3*mu)+1]) * twisted_bc; + rhs->vector_buffer[j] = ((complex_double)vector1_in[i+2*(k+3*mu)] + I*(complex_double)vector1_in[i+2*(k+3*mu)+1]) * twisted_bc; + rhs->vector_buffer[j+6] = ((complex_double)vector2_in[i+2*(k+3*mu)] + I*(complex_double)vector2_in[i+2*(k+3*mu)+1]) * twisted_bc; #else - rhs[j] = ((complex_double)vector1_in[i+2*(k+3*(3-mu))] + I*(complex_double)vector1_in[i+2*(k+3*(3-mu))+1]) * twisted_bc; - rhs[j+6] = ((complex_double)vector2_in[i+2*(k+3*(3-mu))] + I*(complex_double)vector2_in[i+2*(k+3*(3-mu))+1]) * twisted_bc; + rhs->vector_buffer[j] = ((complex_double)vector1_in[i+2*(k+3*(3-mu))] + I*(complex_double)vector1_in[i+2*(k+3*(3-mu))+1]) * twisted_bc; + rhs->vector_buffer[j+6] = ((complex_double)vector2_in[i+2*(k+3*(3-mu))] + I*(complex_double)vector2_in[i+2*(k+3*(3-mu))+1]) * twisted_bc; #endif if(p->initial_guess_zero == 0) { #ifndef BASIS4 - sol[j] = ((complex_double)vector1_out[i+2*(k+3*mu)] + I*(complex_double)vector1_out[i+2*(k+3*mu)+1]) * twisted_bc; - sol[j+6] = ((complex_double)vector2_out[i+2*(k+3*mu)] + I*(complex_double)vector2_out[i+2*(k+3*mu)+1]) * twisted_bc; + sol->vector_buffer[j] = ((complex_double)vector1_out[i+2*(k+3*mu)] + I*(complex_double)vector1_out[i+2*(k+3*mu)+1]) * twisted_bc; + sol->vector_buffer[j+6] = ((complex_double)vector2_out[i+2*(k+3*mu)] + I*(complex_double)vector2_out[i+2*(k+3*mu)+1]) * twisted_bc; #else - sol[j] = ((complex_double)vector1_out[i+2*(k+3*(3-mu))] + I*(complex_double)vector1_out[i+2*(k+3*(3-mu))+1]) * 
twisted_bc; - sol[j+6] = ((complex_double)vector2_out[i+2*(k+3*(3-mu))] + I*(complex_double)vector2_out[i+2*(k+3*(3-mu))+1]) * twisted_bc; + sol->vector_buffer[j] = ((complex_double)vector1_out[i+2*(k+3*(3-mu))] + I*(complex_double)vector1_out[i+2*(k+3*(3-mu))+1]) * twisted_bc; + sol->vector_buffer[j+6] = ((complex_double)vector2_out[i+2*(k+3*(3-mu))] + I*(complex_double)vector2_out[i+2*(k+3*(3-mu))+1]) * twisted_bc; #endif } #ifndef INIT_ONE_PREC if(g.mixed_precision==2) { - vtmp=cabs(rhs[j]); + vtmp=cabs(rhs->vector_buffer[j]); if(vtmp > vmax) vmax=vtmp; if( vtmp > EPS_double && vtmp < vmin ) vmin=vtmp; - vtmp=cabs(rhs[j+6]); + vtmp=cabs(rhs->vector_buffer[j+6]); if(vtmp > vmax) vmax=vtmp; if( vtmp > EPS_double && vtmp < vmin ) vmin=vtmp; } - } + #endif + } if(mu%2) j+=6; } @@ -759,30 +760,31 @@ static inline void DDalphaAMG_driver( double *vector1_out, double *vector1_in, d for ( mu=0; mu<4; mu++ ) for ( k=0; k<3; k++, j++ ) { #ifndef BASIS4 - rhs[j] = ((complex_double)vector1_in[i+2*(k+3*mu)] + I*(complex_double)vector1_in[i+2*(k+3*mu)+1]) * twisted_bc; + rhs->vector_buffer[j] = ((complex_double)vector1_in[i+2*(k+3*mu)] + I*(complex_double)vector1_in[i+2*(k+3*mu)+1]) * twisted_bc; #else - rhs[j] = ((complex_double)vector1_in[i+2*(k+3*(3-mu))] + I*(complex_double)vector1_in[i+2*(k+3*(3-mu))+1]) * twisted_bc; + rhs->vector_buffer[j] = ((complex_double)vector1_in[i+2*(k+3*(3-mu))] + I*(complex_double)vector1_in[i+2*(k+3*(3-mu))+1]) * twisted_bc; #endif if(p->initial_guess_zero == 0) { #ifndef BASIS4 - sol[j] = ((complex_double)vector1_out[i+2*(k+3*mu)] + I*(complex_double)vector1_out[i+2*(k+3*mu)+1]) * twisted_bc; + sol->vector_buffer[j] = ((complex_double)vector1_out[i+2*(k+3*mu)] + I*(complex_double)vector1_out[i+2*(k+3*mu)+1]) * twisted_bc; #else - sol[j] = ((complex_double)vector1_out[i+2*(k+3*(3-mu))] + I*(complex_double)vector1_out[i+2*(k+3*(3-mu))+1]) * twisted_bc; + sol->vector_buffer[j] = ((complex_double)vector1_out[i+2*(k+3*(3-mu))] + I*(complex_double)vector1_out[i+2*(k+3*(3-mu))+1]) * twisted_bc; #endif } #ifndef INIT_ONE_PREC if(g.mixed_precision==2) { - vtmp=cabs(rhs[j]); + vtmp=cabs(rhs->vector_buffer[j]); if(vtmp > vmax) vmax=vtmp; if( vtmp > EPS_double && vtmp < vmin ) vmin=vtmp; } - } + #endif + } } } } @@ -803,10 +805,10 @@ static inline void DDalphaAMG_driver( double *vector1_out, double *vector1_in, d g.mixed_precision=1; p = &(g.p); // storing pointer in x and b - vb = p->b; - vx = p->x; - p->b = g.p_MP.dp.b; - p->x = g.p_MP.dp.x; + vb = p->b.vector_buffer; + vx = p->x.vector_buffer; + p->b.vector_buffer = g.p_MP.dp.b.vector_buffer; + p->x.vector_buffer = g.p_MP.dp.x.vector_buffer; p->tol = g.p_MP.dp.tol; } else precision_changed = 0; #endif @@ -984,8 +986,8 @@ static inline void DDalphaAMG_driver( double *vector1_out, double *vector1_in, d if(g.n_flavours==2) { for ( mu=0; mu<4; mu++ ) { for ( k=0; k<3; k++, j++ ) { - tmp1 = sol[j] * twisted_bc; - tmp2 = sol[j+6] * twisted_bc; + tmp1 = sol->vector_buffer[j] * twisted_bc; + tmp2 = sol->vector_buffer[j+6] * twisted_bc; #ifndef BASIS4 vector1_out[i+2*(k+3*mu)] = creal(tmp1); vector1_out[i+2*(k+3*mu)+1] = cimag(tmp1); @@ -1005,7 +1007,7 @@ static inline void DDalphaAMG_driver( double *vector1_out, double *vector1_in, d #endif for ( mu=0; mu<4; mu++ ) for ( k=0; k<3; k++, j++ ) { - tmp1 = sol[j] * twisted_bc; + tmp1 = sol->vector_buffer[j] * twisted_bc; #ifndef BASIS4 vector1_out[i+2*(k+3*mu)] = creal(tmp1); vector1_out[i+2*(k+3*mu)+1] = cimag(tmp1); @@ -1023,8 +1025,8 @@ static inline void DDalphaAMG_driver( double 
*vector1_out, double *vector1_in, d if (precision_changed) { g.mixed_precision=2; // recovering pointer from x and b - p->b = vb; - p->x = vx; + p->b.vector_buffer = vb; + p->x.vector_buffer = vx; } #endif @@ -1049,9 +1051,14 @@ static inline void DDalphaAMG_ms_driver( double **vector1_out, double *vector1_i double phase[4] = {_COMPLEX_double_ZERO, _COMPLEX_double_ZERO, _COMPLEX_double_ZERO, _COMPLEX_double_ZERO}, vmin=1, vmax=EPS_float, vtmp, nrhs, nrhs2; gmres_double_struct *p = g.mixed_precision==2?&(g.p_MP.dp):&(g.p); - vector_double vb, rhs = p->b; - vector_double vx, sol = p->x; - vector_double source = NULL, solution = NULL, solution2 = NULL; + buffer_double vb, vx; + vector_double *rhs =&(p->b), *sol = &(p->x); + vector_double source, solution, solution2; + + vector_double_init( &source ); + vector_double_init( &solution ); + vector_double_init( &solution2 ); + DDalphaAMG_status tmp_status; double t0, t1; @@ -1102,29 +1109,30 @@ static inline void DDalphaAMG_ms_driver( double **vector1_out, double *vector1_i for ( mu=0; mu<4; mu++ ) { for ( k=0; k<3; k++, j++ ) { #ifndef BASIS4 - rhs[j] = ((complex_double)vector1_in[i+2*(k+3*mu)] + I*(complex_double)vector1_in[i+2*(k+3*mu)+1]) * twisted_bc; - rhs[j+6] = ((complex_double)vector2_in[i+2*(k+3*mu)] + I*(complex_double)vector2_in[i+2*(k+3*mu)+1]) * twisted_bc; + rhs->vector_buffer[j] = ((complex_double)vector1_in[i+2*(k+3*mu)] + I*(complex_double)vector1_in[i+2*(k+3*mu)+1]) * twisted_bc; + rhs->vector_buffer[j+6] = ((complex_double)vector2_in[i+2*(k+3*mu)] + I*(complex_double)vector2_in[i+2*(k+3*mu)+1]) * twisted_bc; #else - rhs[j] = ((complex_double)vector1_in[i+2*(k+3*(3-mu))] + I*(complex_double)vector1_in[i+2*(k+3*(3-mu))+1]) * twisted_bc; - rhs[j+6] = ((complex_double)vector2_in[i+2*(k+3*(3-mu))] + I*(complex_double)vector2_in[i+2*(k+3*(3-mu))+1]) * twisted_bc; + rhs->vector_buffer[j] = ((complex_double)vector1_in[i+2*(k+3*(3-mu))] + I*(complex_double)vector1_in[i+2*(k+3*(3-mu))+1]) * twisted_bc; + rhs->vector_buffer[j+6] = ((complex_double)vector2_in[i+2*(k+3*(3-mu))] + I*(complex_double)vector2_in[i+2*(k+3*(3-mu))+1]) * twisted_bc; #endif #ifndef INIT_ONE_PREC if(g.mixed_precision==2) { - vtmp=cabs(rhs[j]); + vtmp=cabs(rhs->vector_buffer[j]); if(vtmp > vmax) vmax=vtmp; if( vtmp > EPS_double && vtmp < vmin ) vmin=vtmp; - vtmp=cabs(rhs[j+6]); + vtmp=cabs(rhs->vector_buffer[j+6]); if(vtmp > vmax) vmax=vtmp; if( vtmp > EPS_double && vtmp < vmin ) vmin=vtmp; } - } + #endif + } if(mu%2) j+=6; } @@ -1133,21 +1141,22 @@ static inline void DDalphaAMG_ms_driver( double **vector1_out, double *vector1_i for ( mu=0; mu<4; mu++ ) for ( k=0; k<3; k++, j++ ) { #ifndef BASIS4 - rhs[j] = ((complex_double)vector1_in[i+2*(k+3*mu)] + I*(complex_double)vector1_in[i+2*(k+3*mu)+1]) * twisted_bc; + rhs->vector_buffer[j] = ((complex_double)vector1_in[i+2*(k+3*mu)] + I*(complex_double)vector1_in[i+2*(k+3*mu)+1]) * twisted_bc; #else - rhs[j] = ((complex_double)vector1_in[i+2*(k+3*(3-mu))] + I*(complex_double)vector1_in[i+2*(k+3*(3-mu))+1]) * twisted_bc; + rhs->vector_buffer[j] = ((complex_double)vector1_in[i+2*(k+3*(3-mu))] + I*(complex_double)vector1_in[i+2*(k+3*(3-mu))+1]) * twisted_bc; #endif #ifndef INIT_ONE_PREC if( g.mixed_precision == 2 ) { - vtmp = cabs(rhs[j]); + vtmp = cabs(rhs->vector_buffer[j]); if(vtmp > vmax) vmax = vtmp; if( vtmp > EPS_double && vtmp < vmin ) vmin = vtmp; } - } + #endif + } } } } @@ -1168,8 +1177,8 @@ static inline void DDalphaAMG_ms_driver( double **vector1_out, double *vector1_i g.mixed_precision=1; p = &(g.p); // storing 
pointer in x and b - vb = p->b; - vx = p->x; + vb = p->b.vector_buffer; + vx = p->x.vector_buffer; p->b = g.p_MP.dp.b; p->x = g.p_MP.dp.x; p->tol = g.p_MP.dp.tol; @@ -1181,10 +1190,10 @@ static inline void DDalphaAMG_ms_driver( double **vector1_out, double *vector1_i ASSERT( odd_shifts != NULL ); } if ( n_shifts > 1 ) { - MALLOC( source, complex_double, l.inner_vector_size ); - MALLOC( solution, complex_double, l.inner_vector_size ); + vector_double_alloc( &source, _INNER, 1, &l, no_threading); + vector_double_alloc( &solution, _INNER, 1, &l, no_threading); if( _TYPE == _SOLVE_SQ || _TYPE == _SOLVE_SQ_ODD || _TYPE == _SOLVE_SQ_EVEN ) - MALLOC( solution2, complex_double, l.inner_vector_size ); + vector_double_alloc( &solution2, _INNER, 1, &l, no_threading); } for ( n = 0; n < n_shifts; n++ ) { @@ -1220,10 +1229,10 @@ static inline void DDalphaAMG_ms_driver( double **vector1_out, double *vector1_i case _SOLVE : if ( n ) { - vector_copy( rhs, source ); + vector_copy( rhs, &source ); p->initial_guess_zero = 0; } else if ( n_shifts > 1 ) - vector_copy( source, rhs ); + vector_copy( &source, rhs ); solver( ); break; @@ -1231,7 +1240,7 @@ static inline void DDalphaAMG_ms_driver( double **vector1_out, double *vector1_i case _SOLVE_SQ : if ( n ) { - vector_copy( rhs, source ); + vector_copy( rhs, &source ); p->initial_guess_zero = 0; } else if ( n_shifts > 1 ) { THREADED(threading[0]->n_core) @@ -1243,18 +1252,18 @@ static inline void DDalphaAMG_ms_driver( double **vector1_out, double *vector1_i #endif // sol = (D_d^{-1})*g5*(D_u^{-1})*g5*rhs gamma5_double( rhs, rhs, &l, threading[omp_get_thread_num()] ); - vector_copy( source, rhs ); + vector_copy( &source, rhs ); } if( n ) - correct_guess( sol, solution, solution2, even_shifts[n]-even_shifts[n-1], odd_shifts[n]-odd_shifts[n-1]); + correct_guess( sol, &solution, &solution2, even_shifts[n]-even_shifts[n-1], odd_shifts[n]-odd_shifts[n-1]); // read NOTE RESIDUAL THREADED(threading[0]->n_core) nrhs = global_norm_double( rhs, p->v_start, p->v_end, &l, threading[omp_get_thread_num()] ); p->tol = tol[n]/2.; solver( ); if ( n < n_shifts-1 ) - vector_copy( solution, sol ); + vector_copy( &solution, sol ); THREADED(threading[0]->n_core) #ifdef HAVE_TM1p1 @@ -1272,7 +1281,7 @@ static inline void DDalphaAMG_ms_driver( double **vector1_out, double *vector1_i DDalphaAMG_change_mu_sign( &tmp_status ); if( n ) - vector_copy( sol, solution2 ); + vector_copy( sol, &solution2 ); // read NOTE RESIDUAL THREADED(threading[0]->n_core) @@ -1280,7 +1289,7 @@ static inline void DDalphaAMG_ms_driver( double **vector1_out, double *vector1_i p->tol = (tol[n]-g.norm_res)*nrhs/nrhs2/8.; solver( ); if ( n < n_shifts-1 ) - vector_copy( solution2, sol ); + vector_copy( &solution2, sol ); // DDalphaAMG_change_mu_sign( &tmp_status ); warning0("sign of mu changed during the inversion of squared operator\n"); @@ -1289,7 +1298,7 @@ static inline void DDalphaAMG_ms_driver( double **vector1_out, double *vector1_i case _SOLVE_SQ_ODD : if ( n ) { - vector_copy( rhs, source ); + vector_copy( rhs, &source ); p->initial_guess_zero = 0; } else if ( n_shifts > 1 ) { THREADED(threading[0]->n_core) @@ -1302,11 +1311,11 @@ static inline void DDalphaAMG_ms_driver( double **vector1_out, double *vector1_i // sol = (D_d^{-1})*g5*(D_u^{-1})*g5*rhs gamma5_set_even_to_zero_double(rhs, rhs, &l, threading[omp_get_thread_num()]); - vector_copy( source, rhs ); + vector_copy( &source, rhs ); } if( n ) - correct_guess( sol, solution, solution2, even_shifts[n]-even_shifts[n-1], odd_shifts[n]-odd_shifts[n-1]); + 
correct_guess( sol, &solution, &solution2, even_shifts[n]-even_shifts[n-1], odd_shifts[n]-odd_shifts[n-1]); // read NOTE RESIDUAL THREADED(threading[0]->n_core) @@ -1314,7 +1323,7 @@ static inline void DDalphaAMG_ms_driver( double **vector1_out, double *vector1_i p->tol = tol[n]/2.; solver( ); if ( n < n_shifts-1 ) - vector_copy( solution, sol ); + vector_copy( &solution, sol ); THREADED(threading[0]->n_core) #ifdef HAVE_TM1p1 @@ -1332,7 +1341,7 @@ static inline void DDalphaAMG_ms_driver( double **vector1_out, double *vector1_i DDalphaAMG_change_mu_sign( &tmp_status ); if( n ) - vector_copy( sol, solution2 ); + vector_copy( sol, &solution2 ); // read NOTE RESIDUAL THREADED(threading[0]->n_core) @@ -1340,7 +1349,7 @@ static inline void DDalphaAMG_ms_driver( double **vector1_out, double *vector1_i p->tol = (tol[n]-g.norm_res)*nrhs/nrhs2/8.; solver( ); if ( n < n_shifts-1 ) - vector_copy( solution2, sol ); + vector_copy( &solution2, sol ); // DDalphaAMG_change_mu_sign( &tmp_status ); warning0("sign of mu changed during the inversion of squared operator\n"); @@ -1349,7 +1358,7 @@ static inline void DDalphaAMG_ms_driver( double **vector1_out, double *vector1_i case _SOLVE_SQ_EVEN : if ( n ) { - vector_copy( rhs, source ); + vector_copy( rhs, &source ); p->initial_guess_zero = 0; } else if ( n_shifts > 1 ) { THREADED(threading[0]->n_core) @@ -1362,11 +1371,11 @@ static inline void DDalphaAMG_ms_driver( double **vector1_out, double *vector1_i // sol = (D_d^{-1})*g5*(D_u^{-1})*g5*rhs gamma5_set_odd_to_zero_double(rhs, rhs, &l, threading[omp_get_thread_num()]); - vector_copy( source, rhs ); + vector_copy( &source, rhs ); } if( n ) - correct_guess( sol, solution, solution2, even_shifts[n]-even_shifts[n-1], odd_shifts[n]-odd_shifts[n-1]); + correct_guess( sol, &solution, &solution2, even_shifts[n]-even_shifts[n-1], odd_shifts[n]-odd_shifts[n-1]); // read NOTE RESIDUAL THREADED(threading[0]->n_core) @@ -1374,7 +1383,7 @@ static inline void DDalphaAMG_ms_driver( double **vector1_out, double *vector1_i p->tol = tol[n]/2.; solver( ); if ( n < n_shifts-1 ) - vector_copy( solution, sol ); + vector_copy( &solution, sol ); THREADED(threading[0]->n_core) #ifdef HAVE_TM1p1 @@ -1392,14 +1401,14 @@ static inline void DDalphaAMG_ms_driver( double **vector1_out, double *vector1_i DDalphaAMG_change_mu_sign( &tmp_status ); if( n ) - vector_copy( sol, solution2 ); + vector_copy( sol, &solution2 ); // read NOTE RESIDUAL THREADED(threading[0]->n_core) nrhs2 = global_norm_double( rhs, p->v_start, p->v_end, &l, threading[omp_get_thread_num()] ); p->tol = (tol[n]-g.norm_res)*nrhs/nrhs2/8.; solver( ); if ( n < n_shifts-1 ) - vector_copy( solution2, sol ); + vector_copy( &solution2, sol ); // DDalphaAMG_change_mu_sign( &tmp_status ); warning0("sign of mu changed during the inversion of squared operator\n"); @@ -1449,8 +1458,8 @@ static inline void DDalphaAMG_ms_driver( double **vector1_out, double *vector1_i if(g.n_flavours==2) { for ( mu=0; mu<4; mu++ ) { for ( k=0; k<3; k++, j++ ) { - tmp1 = sol[j] * twisted_bc; - tmp2 = sol[j+6] * twisted_bc; + tmp1 = sol->vector_buffer[j] * twisted_bc; + tmp2 = sol->vector_buffer[j+6] * twisted_bc; #ifndef BASIS4 vector1_out[n][i+2*(k+3*mu)] = creal(tmp1); vector1_out[n][i+2*(k+3*mu)+1] = cimag(tmp1); @@ -1470,7 +1479,7 @@ static inline void DDalphaAMG_ms_driver( double **vector1_out, double *vector1_i #endif for ( mu=0; mu<4; mu++ ) for ( k=0; k<3; k++, j++ ) { - tmp1 = sol[j] * twisted_bc; + tmp1 = sol->vector_buffer[j] * twisted_bc; #ifndef BASIS4 vector1_out[n][i+2*(k+3*mu)] = 
creal(tmp1); vector1_out[n][i+2*(k+3*mu)+1] = cimag(tmp1); @@ -1488,10 +1497,10 @@ static inline void DDalphaAMG_ms_driver( double **vector1_out, double *vector1_i p->initial_guess_zero = 1; if ( n_shifts > 0 ) { - FREE( source, complex_double, l.inner_vector_size ); - FREE( solution, complex_double, l.inner_vector_size ); + vector_double_free( &source, &l, no_threading); + vector_double_free( &solution, &l, no_threading); if( _TYPE == _SOLVE_SQ || _TYPE == _SOLVE_SQ_ODD || _TYPE == _SOLVE_SQ_EVEN ) - FREE( solution2, complex_double, l.inner_vector_size ); + vector_double_free( &solution2, &l, no_threading); } @@ -1499,8 +1508,8 @@ if (precision_changed) { g.mixed_precision=2; // recovering pointer from x and b - p->b = vb; - p->x = vx; + p->b.vector_buffer = vb; + p->x.vector_buffer = vx; } #endif @@ -1533,8 +1542,8 @@ static inline void DDalphaAMG_proj_driver( double *vector_out, double *vector_in from=ltmp->next_level; to=ltmp; } - vector_float rhs = from->p_float.b; - vector_float sol = to->p_float.x; + vector_float *rhs = &(from->p_float.b); + vector_float *sol = &(to->p_float.x); double t0, t1; t0 = MPI_Wtime(); @@ -1559,7 +1568,7 @@ i = 2*j; for ( mu=0; mu<from->num_lattice_site_var; mu++, j++ ) - rhs[j] = ((complex_float)vector_in[i+2*mu] + I*(complex_float)vector_in[i+2*mu+1]); + rhs->vector_buffer[j] = ((complex_float)vector_in[i+2*mu] + I*(complex_float)vector_in[i+2*mu+1]); } switch(_TYPE) { @@ -1596,8 +1605,8 @@ i = 2*j; for ( mu=0; mu<to->num_lattice_site_var; mu++, j++ ) { - vector_out[i+2*mu] = (double) creal(sol[j]); - vector_out[i+2*mu+1] = (double) cimag(sol[j]); + vector_out[i+2*mu] = (double) creal(sol->vector_buffer[j]); + vector_out[i+2*mu+1] = (double) cimag(sol->vector_buffer[j]); } } @@ -1839,7 +1848,9 @@ void DDalphaAMG_define_vector_const( double *vector, double re, double im ) { if(vector!=NULL){ int start, end; compute_core_start_end( 0, l.inner_vector_size, &start, &end, &l, threading[omp_get_thread_num()]); - vector_double_define( (vector_double) vector, re+I*im, start, end, &l ); + vector_double vec; + vec.vector_buffer= (buffer_double) vector; + vector_double_define( &vec, re+I*im, start, end, &l ); } else { warning0("Vector NULL when calling DDalphaAMG_define_vector_const!"); @@ -1852,7 +1863,9 @@ void DDalphaAMG_define_vector_rand( double *vector ) { if(vector!=NULL){ int start, end; compute_core_start_end( 0, l.inner_vector_size, &start, &end, &l, threading[omp_get_thread_num()]); - vector_double_define_random( (vector_double) vector, start, end, &l ); + vector_double vec; + vec.vector_buffer= (buffer_double) vector; + vector_double_define_random( &vec, start, end, &l ); } else { warning0("Vector NULL when calling DDalphaAMG_define_vector_const!"); @@ -1865,7 +1878,9 @@ double DDalphaAMG_vector_norm( double *vector ) { double norm = 0; THREADED(threading[0]->n_core) if(vector!=NULL){ - norm = global_norm_double( (vector_double) vector, 0, l.inner_vector_size, &l, threading[omp_get_thread_num()] ); + vector_double vec; + vec.vector_buffer = (buffer_double) vector; + norm = global_norm_double( &vec, 0, l.inner_vector_size, &l, threading[omp_get_thread_num()] ); } else { warning0("Vector NULL when calling DDalphaAMG_define_vector_const!"); @@ -1880,7 +1895,9 @@ void DDalphaAMG_vector_saxpy( double *vector_out, double a, double
*x, double *y if(vector_out!=NULL && x!=NULL && y!=NULL){ int start, end; compute_core_start_end( 0, l.inner_vector_size, &start, &end, &l, threading[omp_get_thread_num()]); - vector_double_saxpy( (vector_double) vector_out, (vector_double) x, (vector_double) y, a, start, end, &l ); + vector_double vec_out, xx, yy; + vec_out.vector_buffer= (buffer_double) vector_out; xx.vector_buffer= (buffer_double) x; yy.vector_buffer= (buffer_double) y; + vector_double_saxpy( &vec_out, &xx, &yy, a, start, end, &l ); } else { warning0("Vector NULL when calling DDalphaAMG_define_vector_const!"); diff --git a/src/blas_vectorized.h b/src/blas_vectorized.h deleted file mode 100644 index 645c457..0000000 --- a/src/blas_vectorized.h +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder. - * - * This file is part of the DDalphaAMG solver library. - * - * The DDalphaAMG solver library is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * The DDalphaAMG solver library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * - * You should have received a copy of the GNU General Public License - * along with the DDalphaAMG solver library. If not, see http://www.gnu.org/licenses/. - * - */ - -#ifndef BLAS_VECTORIZED_H -#define BLAS_VECTORIZED_H - -// BLAS naming convention: LDA = leading dimension of A -#ifdef SSE -#include "sse_blas_vectorized.h" -#endif - -// C=A*B+C -static inline void cgemv(const int N, const OPERATOR_TYPE_float *A, int lda, const float *B, float *C) -{ -#ifdef SSE - sse_cgemv( N, A, lda, B, C ); -#endif -} - -// C=-A*B+C -static inline void cgenmv(const int N, const OPERATOR_TYPE_float *A, int lda, const float *B, float *C) -{ -#ifdef SSE - sse_cgenmv( N, A, lda, B, C ); -#endif -} - -// C=A*B+C with padded layout -static inline void cgemv_padded(const int N, const OPERATOR_TYPE_float *A, int lda, int padded, const float *B, float *C) -{ -#ifdef SSE - sse_cgemv_padded( N, A, lda, padded, B, C ); -#endif -} - -// C=-A*B+C with padded layout -static inline void cgenmv_padded(const int N, const OPERATOR_TYPE_float *A, int lda, int padded, const float *B, float *C) -{ -#ifdef SSE - sse_cgenmv_padded( N, A, lda, padded, B, C ); -#endif -} - - -static inline void cgem_inverse(const int N, OPERATOR_TYPE_float *A_inverse, OPERATOR_TYPE_float *A, int lda) -{ -#ifdef SSE - sse_cgem_inverse( N, A_inverse, A, lda ); -#endif -} - -#endif // BLAS_VECTORIZED_H diff --git a/src/clifford.h b/src/clifford.h index 6521566..9307579 100644 --- a/src/clifford.h +++ b/src/clifford.h @@ -99,64 +99,6 @@ #define GAMMA_X_SPIN2_VAL I #define GAMMA_X_SPIN3_CO 1 #define GAMMA_X_SPIN3_VAL -I -#ifdef SSE - #define GAMMA_T_SPIN0_RE_SIGN -1 - #define GAMMA_T_SPIN1_RE_SIGN -1 - #define GAMMA_T_SPIN2_RE_SIGN -1 - #define GAMMA_T_SPIN3_RE_SIGN -1 - #define GAMMA_T_SPIN0_IM_SIGN -1 - #define GAMMA_T_SPIN1_IM_SIGN -1 - #define GAMMA_T_SPIN2_IM_SIGN -1 - #define GAMMA_T_SPIN3_IM_SIGN -1 - #define GAMMA_T_SPIN0_OFFSET 0 - #define GAMMA_T_SPIN1_OFFSET 0 - #define GAMMA_T_SPIN2_OFFSET 0 - #define GAMMA_T_SPIN3_OFFSET 0 - - #define GAMMA_Z_SPIN0_RE_SIGN +1 - #define 
GAMMA_Z_SPIN1_RE_SIGN +1 - #define GAMMA_Z_SPIN2_RE_SIGN -1 - #define GAMMA_Z_SPIN3_RE_SIGN -1 - #define GAMMA_Z_SPIN0_IM_SIGN -1 - #define GAMMA_Z_SPIN1_IM_SIGN -1 - #define GAMMA_Z_SPIN2_IM_SIGN +1 - #define GAMMA_Z_SPIN3_IM_SIGN +1 - #define GAMMA_Z_SPIN0_OFFSET 1 - #define GAMMA_Z_SPIN1_OFFSET 1 - #define GAMMA_Z_SPIN2_OFFSET 1 - #define GAMMA_Z_SPIN3_OFFSET 1 - - #define GAMMA_Y_SPIN0_RE_SIGN -1 - #define GAMMA_Y_SPIN1_RE_SIGN +1 - #define GAMMA_Y_SPIN2_RE_SIGN +1 - #define GAMMA_Y_SPIN3_RE_SIGN -1 - #define GAMMA_Y_SPIN0_IM_SIGN -1 - #define GAMMA_Y_SPIN1_IM_SIGN +1 - #define GAMMA_Y_SPIN2_IM_SIGN +1 - #define GAMMA_Y_SPIN3_IM_SIGN -1 - #define GAMMA_Y_SPIN0_OFFSET 0 - #define GAMMA_Y_SPIN1_OFFSET 0 - #define GAMMA_Y_SPIN2_OFFSET 0 - #define GAMMA_Y_SPIN3_OFFSET 0 - - #define GAMMA_X_SPIN0_RE_SIGN +1 - #define GAMMA_X_SPIN1_RE_SIGN -1 - #define GAMMA_X_SPIN2_RE_SIGN -1 - #define GAMMA_X_SPIN3_RE_SIGN +1 - #define GAMMA_X_SPIN0_IM_SIGN -1 - #define GAMMA_X_SPIN1_IM_SIGN +1 - #define GAMMA_X_SPIN2_IM_SIGN +1 - #define GAMMA_X_SPIN3_IM_SIGN -1 - #define GAMMA_X_SPIN0_OFFSET 1 - #define GAMMA_X_SPIN1_OFFSET 1 - #define GAMMA_X_SPIN2_OFFSET 1 - #define GAMMA_X_SPIN3_OFFSET 1 - - #define GAMMA_T_SHUFFLE(A) A - #define GAMMA_Z_SHUFFLE(A) _mm_shuffle_pd(A,A,_MM_SHUFFLE2(0,1)) - #define GAMMA_Y_SHUFFLE(A) A - #define GAMMA_X_SHUFFLE(A) _mm_shuffle_pd(A,A,_MM_SHUFFLE2(0,1)) -#endif /* ------------------------------------------------- */ #else @@ -224,64 +166,6 @@ #define GAMMA_X_SPIN2_VAL I #define GAMMA_X_SPIN3_CO 0 #define GAMMA_X_SPIN3_VAL I -#ifdef SSE - #define GAMMA_T_SPIN0_RE_SIGN -1 - #define GAMMA_T_SPIN1_RE_SIGN -1 - #define GAMMA_T_SPIN2_RE_SIGN -1 - #define GAMMA_T_SPIN3_RE_SIGN -1 - #define GAMMA_T_SPIN0_IM_SIGN -1 - #define GAMMA_T_SPIN1_IM_SIGN -1 - #define GAMMA_T_SPIN2_IM_SIGN -1 - #define GAMMA_T_SPIN3_IM_SIGN -1 - #define GAMMA_T_SPIN0_OFFSET 0 - #define GAMMA_T_SPIN1_OFFSET 0 - #define GAMMA_T_SPIN2_OFFSET 0 - #define GAMMA_T_SPIN3_OFFSET 0 - - #define GAMMA_Z_SPIN0_RE_SIGN +1 - #define GAMMA_Z_SPIN1_RE_SIGN -1 - #define GAMMA_Z_SPIN2_RE_SIGN -1 - #define GAMMA_Z_SPIN3_RE_SIGN +1 - #define GAMMA_Z_SPIN0_IM_SIGN -1 - #define GAMMA_Z_SPIN1_IM_SIGN +1 - #define GAMMA_Z_SPIN2_IM_SIGN +1 - #define GAMMA_Z_SPIN3_IM_SIGN -1 - #define GAMMA_Z_SPIN0_OFFSET 1 - #define GAMMA_Z_SPIN1_OFFSET 1 - #define GAMMA_Z_SPIN2_OFFSET 1 - #define GAMMA_Z_SPIN3_OFFSET 1 - - #define GAMMA_Y_SPIN0_RE_SIGN +1 - #define GAMMA_Y_SPIN1_RE_SIGN -1 - #define GAMMA_Y_SPIN2_RE_SIGN -1 - #define GAMMA_Y_SPIN3_RE_SIGN +1 - #define GAMMA_Y_SPIN0_IM_SIGN +1 - #define GAMMA_Y_SPIN1_IM_SIGN -1 - #define GAMMA_Y_SPIN2_IM_SIGN -1 - #define GAMMA_Y_SPIN3_IM_SIGN +1 - #define GAMMA_Y_SPIN0_OFFSET 0 - #define GAMMA_Y_SPIN1_OFFSET 0 - #define GAMMA_Y_SPIN2_OFFSET 0 - #define GAMMA_Y_SPIN3_OFFSET 0 - - #define GAMMA_X_SPIN0_RE_SIGN +1 - #define GAMMA_X_SPIN1_RE_SIGN +1 - #define GAMMA_X_SPIN2_RE_SIGN -1 - #define GAMMA_X_SPIN3_RE_SIGN -1 - #define GAMMA_X_SPIN0_IM_SIGN -1 - #define GAMMA_X_SPIN1_IM_SIGN -1 - #define GAMMA_X_SPIN2_IM_SIGN +1 - #define GAMMA_X_SPIN3_IM_SIGN +1 - #define GAMMA_X_SPIN0_OFFSET 1 - #define GAMMA_X_SPIN1_OFFSET 1 - #define GAMMA_X_SPIN2_OFFSET 1 - #define GAMMA_X_SPIN3_OFFSET 1 - - #define GAMMA_T_SHUFFLE(A) A - #define GAMMA_Z_SHUFFLE(A) _mm_shuffle_pd(A,A,_MM_SHUFFLE2(0,1)) - #define GAMMA_Y_SHUFFLE(A) A - #define GAMMA_X_SHUFFLE(A) _mm_shuffle_pd(A,A,_MM_SHUFFLE2(0,1)) -#endif /* ------------------------------------------------- */ #else #ifdef BASIS2 @@ -346,64 +230,6 @@ #define 
GAMMA_X_SPIN2_VAL -I #define GAMMA_X_SPIN3_CO 0 #define GAMMA_X_SPIN3_VAL -I -#ifdef SSE - #define GAMMA_T_SPIN0_RE_SIGN +1 - #define GAMMA_T_SPIN1_RE_SIGN +1 - #define GAMMA_T_SPIN2_RE_SIGN +1 - #define GAMMA_T_SPIN3_RE_SIGN +1 - #define GAMMA_T_SPIN0_IM_SIGN +1 - #define GAMMA_T_SPIN1_IM_SIGN +1 - #define GAMMA_T_SPIN2_IM_SIGN +1 - #define GAMMA_T_SPIN3_IM_SIGN +1 - #define GAMMA_T_SPIN0_OFFSET 0 - #define GAMMA_T_SPIN1_OFFSET 0 - #define GAMMA_T_SPIN2_OFFSET 0 - #define GAMMA_T_SPIN3_OFFSET 0 - - #define GAMMA_Z_SPIN0_RE_SIGN -1 - #define GAMMA_Z_SPIN1_RE_SIGN +1 - #define GAMMA_Z_SPIN2_RE_SIGN +1 - #define GAMMA_Z_SPIN3_RE_SIGN -1 - #define GAMMA_Z_SPIN0_IM_SIGN +1 - #define GAMMA_Z_SPIN1_IM_SIGN -1 - #define GAMMA_Z_SPIN2_IM_SIGN -1 - #define GAMMA_Z_SPIN3_IM_SIGN +1 - #define GAMMA_Z_SPIN0_OFFSET 1 - #define GAMMA_Z_SPIN1_OFFSET 1 - #define GAMMA_Z_SPIN2_OFFSET 1 - #define GAMMA_Z_SPIN3_OFFSET 1 - - #define GAMMA_Y_SPIN0_RE_SIGN -1 - #define GAMMA_Y_SPIN1_RE_SIGN +1 - #define GAMMA_Y_SPIN2_RE_SIGN +1 - #define GAMMA_Y_SPIN3_RE_SIGN -1 - #define GAMMA_Y_SPIN0_IM_SIGN -1 - #define GAMMA_Y_SPIN1_IM_SIGN +1 - #define GAMMA_Y_SPIN2_IM_SIGN +1 - #define GAMMA_Y_SPIN3_IM_SIGN -1 - #define GAMMA_Y_SPIN0_OFFSET 0 - #define GAMMA_Y_SPIN1_OFFSET 0 - #define GAMMA_Y_SPIN2_OFFSET 0 - #define GAMMA_Y_SPIN3_OFFSET 0 - - #define GAMMA_X_SPIN0_RE_SIGN -1 - #define GAMMA_X_SPIN1_RE_SIGN -1 - #define GAMMA_X_SPIN2_RE_SIGN +1 - #define GAMMA_X_SPIN3_RE_SIGN +1 - #define GAMMA_X_SPIN0_IM_SIGN +1 - #define GAMMA_X_SPIN1_IM_SIGN +1 - #define GAMMA_X_SPIN2_IM_SIGN -1 - #define GAMMA_X_SPIN3_IM_SIGN -1 - #define GAMMA_X_SPIN0_OFFSET 1 - #define GAMMA_X_SPIN1_OFFSET 1 - #define GAMMA_X_SPIN2_OFFSET 1 - #define GAMMA_X_SPIN3_OFFSET 1 - - #define GAMMA_T_SHUFFLE(A) A - #define GAMMA_Z_SHUFFLE(A) _mm_shuffle_pd(A,A,_MM_SHUFFLE2(0,1)) - #define GAMMA_Y_SHUFFLE(A) A - #define GAMMA_X_SHUFFLE(A) _mm_shuffle_pd(A,A,_MM_SHUFFLE2(0,1)) -#endif #else #ifdef BASIS3 // Basis used in the QOPQDP Code (by James Osborn/USQCD) @@ -467,64 +293,6 @@ #define GAMMA_X_SPIN2_VAL -I #define GAMMA_X_SPIN3_CO 1 #define GAMMA_X_SPIN3_VAL I -#ifdef SSE - #define GAMMA_T_SPIN0_RE_SIGN +1 - #define GAMMA_T_SPIN1_RE_SIGN +1 - #define GAMMA_T_SPIN2_RE_SIGN +1 - #define GAMMA_T_SPIN3_RE_SIGN +1 - #define GAMMA_T_SPIN0_IM_SIGN +1 - #define GAMMA_T_SPIN1_IM_SIGN +1 - #define GAMMA_T_SPIN2_IM_SIGN +1 - #define GAMMA_T_SPIN3_IM_SIGN +1 - #define GAMMA_T_SPIN0_OFFSET 0 - #define GAMMA_T_SPIN1_OFFSET 0 - #define GAMMA_T_SPIN2_OFFSET 0 - #define GAMMA_T_SPIN3_OFFSET 0 - - #define GAMMA_Z_SPIN0_RE_SIGN -1 - #define GAMMA_Z_SPIN1_RE_SIGN -1 - #define GAMMA_Z_SPIN2_RE_SIGN +1 - #define GAMMA_Z_SPIN3_RE_SIGN +1 - #define GAMMA_Z_SPIN0_IM_SIGN +1 - #define GAMMA_Z_SPIN1_IM_SIGN +1 - #define GAMMA_Z_SPIN2_IM_SIGN -1 - #define GAMMA_Z_SPIN3_IM_SIGN -1 - #define GAMMA_Z_SPIN0_OFFSET 1 - #define GAMMA_Z_SPIN1_OFFSET 1 - #define GAMMA_Z_SPIN2_OFFSET 1 - #define GAMMA_Z_SPIN3_OFFSET 1 - - #define GAMMA_Y_SPIN0_RE_SIGN -1 - #define GAMMA_Y_SPIN1_RE_SIGN +1 - #define GAMMA_Y_SPIN2_RE_SIGN +1 - #define GAMMA_Y_SPIN3_RE_SIGN -1 - #define GAMMA_Y_SPIN0_IM_SIGN -1 - #define GAMMA_Y_SPIN1_IM_SIGN +1 - #define GAMMA_Y_SPIN2_IM_SIGN +1 - #define GAMMA_Y_SPIN3_IM_SIGN -1 - #define GAMMA_Y_SPIN0_OFFSET 0 - #define GAMMA_Y_SPIN1_OFFSET 0 - #define GAMMA_Y_SPIN2_OFFSET 0 - #define GAMMA_Y_SPIN3_OFFSET 0 - - #define GAMMA_X_SPIN0_RE_SIGN -1 - #define GAMMA_X_SPIN1_RE_SIGN +1 - #define GAMMA_X_SPIN2_RE_SIGN +1 - #define GAMMA_X_SPIN3_RE_SIGN -1 - #define 
GAMMA_X_SPIN0_IM_SIGN +1 - #define GAMMA_X_SPIN1_IM_SIGN -1 - #define GAMMA_X_SPIN2_IM_SIGN -1 - #define GAMMA_X_SPIN3_IM_SIGN +1 - #define GAMMA_X_SPIN0_OFFSET 1 - #define GAMMA_X_SPIN1_OFFSET 1 - #define GAMMA_X_SPIN2_OFFSET 1 - #define GAMMA_X_SPIN3_OFFSET 1 - - #define GAMMA_T_SHUFFLE(A) A - #define GAMMA_Z_SHUFFLE(A) _mm_shuffle_pd(A,A,_MM_SHUFFLE2(0,1)) - #define GAMMA_Y_SHUFFLE(A) A - #define GAMMA_X_SHUFFLE(A) _mm_shuffle_pd(A,A,_MM_SHUFFLE2(0,1)) -#endif #else #ifdef BASIS4 // tmLQCD BASIS with an addition change of sign in gamma5 @@ -589,100 +357,10 @@ #define GAMMA_X_SPIN2_VAL -I #define GAMMA_X_SPIN3_CO 0 #define GAMMA_X_SPIN3_VAL -I -#ifdef SSE - #define GAMMA_T_SPIN0_RE_SIGN -1 - #define GAMMA_T_SPIN1_RE_SIGN -1 - #define GAMMA_T_SPIN2_RE_SIGN -1 - #define GAMMA_T_SPIN3_RE_SIGN -1 - #define GAMMA_T_SPIN0_IM_SIGN -1 - #define GAMMA_T_SPIN1_IM_SIGN -1 - #define GAMMA_T_SPIN2_IM_SIGN -1 - #define GAMMA_T_SPIN3_IM_SIGN -1 - #define GAMMA_T_SPIN0_OFFSET 0 - #define GAMMA_T_SPIN1_OFFSET 0 - #define GAMMA_T_SPIN2_OFFSET 0 - #define GAMMA_T_SPIN3_OFFSET 0 - - #define GAMMA_Z_SPIN0_RE_SIGN +1 - #define GAMMA_Z_SPIN1_RE_SIGN -1 - #define GAMMA_Z_SPIN2_RE_SIGN -1 - #define GAMMA_Z_SPIN3_RE_SIGN +1 - #define GAMMA_Z_SPIN0_IM_SIGN -1 - #define GAMMA_Z_SPIN1_IM_SIGN +1 - #define GAMMA_Z_SPIN2_IM_SIGN +1 - #define GAMMA_Z_SPIN3_IM_SIGN -1 - #define GAMMA_Z_SPIN0_OFFSET 1 - #define GAMMA_Z_SPIN1_OFFSET 1 - #define GAMMA_Z_SPIN2_OFFSET 1 - #define GAMMA_Z_SPIN3_OFFSET 1 - - #define GAMMA_Y_SPIN0_RE_SIGN -1 - #define GAMMA_Y_SPIN1_RE_SIGN +1 - #define GAMMA_Y_SPIN2_RE_SIGN +1 - #define GAMMA_Y_SPIN3_RE_SIGN -1 - #define GAMMA_Y_SPIN0_IM_SIGN -1 - #define GAMMA_Y_SPIN1_IM_SIGN +1 - #define GAMMA_Y_SPIN2_IM_SIGN +1 - #define GAMMA_Y_SPIN3_IM_SIGN -1 - #define GAMMA_Y_SPIN0_OFFSET 0 - #define GAMMA_Y_SPIN1_OFFSET 0 - #define GAMMA_Y_SPIN2_OFFSET 0 - #define GAMMA_Y_SPIN3_OFFSET 0 - - #define GAMMA_X_SPIN0_RE_SIGN -1 - #define GAMMA_X_SPIN1_RE_SIGN -1 - #define GAMMA_X_SPIN2_RE_SIGN +1 - #define GAMMA_X_SPIN3_RE_SIGN +1 - #define GAMMA_X_SPIN0_IM_SIGN +1 - #define GAMMA_X_SPIN1_IM_SIGN +1 - #define GAMMA_X_SPIN2_IM_SIGN -1 - #define GAMMA_X_SPIN3_IM_SIGN -1 - #define GAMMA_X_SPIN0_OFFSET 1 - #define GAMMA_X_SPIN1_OFFSET 1 - #define GAMMA_X_SPIN2_OFFSET 1 - #define GAMMA_X_SPIN3_OFFSET 1 - - #define GAMMA_T_SHUFFLE(A) A - #define GAMMA_Z_SHUFFLE(A) _mm_shuffle_pd(A,A,_MM_SHUFFLE2(0,1)) - #define GAMMA_Y_SHUFFLE(A) A - #define GAMMA_X_SHUFFLE(A) _mm_shuffle_pd(A,A,_MM_SHUFFLE2(0,1)) -#endif /* ------------------------------------------------- */ + /* ------------------------------------------------- */ #endif #endif #endif #endif #endif - -#ifdef SSE -static const int gamma_co[4][4] = { - {GAMMA_T_SPIN0_CO, GAMMA_T_SPIN1_CO, GAMMA_T_SPIN2_CO, GAMMA_T_SPIN3_CO}, - {GAMMA_Z_SPIN0_CO, GAMMA_Z_SPIN1_CO, GAMMA_Z_SPIN2_CO, GAMMA_Z_SPIN3_CO}, - {GAMMA_Y_SPIN0_CO, GAMMA_Y_SPIN1_CO, GAMMA_Y_SPIN2_CO, GAMMA_Y_SPIN3_CO}, - {GAMMA_X_SPIN0_CO, GAMMA_X_SPIN1_CO, GAMMA_X_SPIN2_CO, GAMMA_X_SPIN3_CO}}; - -static const double complex gamma_val[4][4] = { - {GAMMA_T_SPIN0_VAL, GAMMA_T_SPIN1_VAL, GAMMA_T_SPIN2_VAL, GAMMA_T_SPIN3_VAL}, - {GAMMA_Z_SPIN0_VAL, GAMMA_Z_SPIN1_VAL, GAMMA_Z_SPIN2_VAL, GAMMA_Z_SPIN3_VAL}, - {GAMMA_Y_SPIN0_VAL, GAMMA_Y_SPIN1_VAL, GAMMA_Y_SPIN2_VAL, GAMMA_Y_SPIN3_VAL}, - {GAMMA_X_SPIN0_VAL, GAMMA_X_SPIN1_VAL, GAMMA_X_SPIN2_VAL, GAMMA_X_SPIN3_VAL}}; - -static const int gamma_offset[4][4] = { - {GAMMA_T_SPIN0_OFFSET,GAMMA_T_SPIN1_OFFSET,GAMMA_T_SPIN2_OFFSET,GAMMA_T_SPIN3_OFFSET}, - 
{GAMMA_Z_SPIN0_OFFSET,GAMMA_Z_SPIN1_OFFSET,GAMMA_Z_SPIN2_OFFSET,GAMMA_Z_SPIN3_OFFSET}, - {GAMMA_Y_SPIN0_OFFSET,GAMMA_Y_SPIN1_OFFSET,GAMMA_Y_SPIN2_OFFSET,GAMMA_Y_SPIN3_OFFSET}, - {GAMMA_X_SPIN0_OFFSET,GAMMA_X_SPIN1_OFFSET,GAMMA_X_SPIN2_OFFSET,GAMMA_X_SPIN3_OFFSET}}; - -static const int gamma_re_sign[4][4] = { - {GAMMA_T_SPIN0_RE_SIGN,GAMMA_T_SPIN1_RE_SIGN,GAMMA_T_SPIN2_RE_SIGN,GAMMA_T_SPIN3_RE_SIGN}, - {GAMMA_Z_SPIN0_RE_SIGN,GAMMA_Z_SPIN1_RE_SIGN,GAMMA_Z_SPIN2_RE_SIGN,GAMMA_Z_SPIN3_RE_SIGN}, - {GAMMA_Y_SPIN0_RE_SIGN,GAMMA_Y_SPIN1_RE_SIGN,GAMMA_Y_SPIN2_RE_SIGN,GAMMA_Y_SPIN3_RE_SIGN}, - {GAMMA_X_SPIN0_RE_SIGN,GAMMA_X_SPIN1_RE_SIGN,GAMMA_X_SPIN2_RE_SIGN,GAMMA_X_SPIN3_RE_SIGN}}; - -static const int gamma_im_sign[4][4] = { - {GAMMA_T_SPIN0_IM_SIGN,GAMMA_T_SPIN1_IM_SIGN,GAMMA_T_SPIN2_IM_SIGN,GAMMA_T_SPIN3_IM_SIGN}, - {GAMMA_Z_SPIN0_IM_SIGN,GAMMA_Z_SPIN1_IM_SIGN,GAMMA_Z_SPIN2_IM_SIGN,GAMMA_Z_SPIN3_IM_SIGN}, - {GAMMA_Y_SPIN0_IM_SIGN,GAMMA_Y_SPIN1_IM_SIGN,GAMMA_Y_SPIN2_IM_SIGN,GAMMA_Y_SPIN3_IM_SIGN}, - {GAMMA_X_SPIN0_IM_SIGN,GAMMA_X_SPIN1_IM_SIGN,GAMMA_X_SPIN2_IM_SIGN,GAMMA_X_SPIN3_IM_SIGN}}; -#endif - #endif diff --git a/src/coarse_oddeven_generic.c b/src/coarse_oddeven_generic.c index b0baa6a..a5c4e5a 100644 --- a/src/coarse_oddeven_generic.c +++ b/src/coarse_oddeven_generic.c @@ -254,62 +254,59 @@ void coarse_selfcoupling_LU_doublet_decomposition_PRECISION( config_PRECISION ou #endif -void coarse_perform_fwd_bwd_subs_PRECISION( vector_PRECISION x, vector_PRECISION b, config_PRECISION A, level_struct *l ) { +void coarse_perform_fwd_bwd_subs_PRECISION( vector_PRECISION *x, vector_PRECISION *b, config_PRECISION A, level_struct *l ) { register int i, j, n2 = l->num_lattice_site_var; // solve x = U^(-1) L^(-1) b // forward substitution with L for ( i=0; i<n2; i++ ) { - x[i] = b[i]; + x->vector_buffer[i] = b->vector_buffer[i]; for ( j=0; j<i; j++ ) { - x[i] = x[i] - A[i*n2+j]*x[j]; + x->vector_buffer[i] = x->vector_buffer[i] - A[i*n2+j]*x->vector_buffer[j]; } } // backward substitution with U for ( i=n2-1; i>=0; i-- ) { for ( j=i+1; j<n2; j++ ) { - x[i] = x[i] - A[i*n2+j]*x[j]; + x->vector_buffer[i] = x->vector_buffer[i] - A[i*n2+j]*x->vector_buffer[j]; } - x[i] = x[i]/A[i*(n2+1)]; + x->vector_buffer[i] = x->vector_buffer[i]/A[i*(n2+1)]; } } -void coarse_LU_multiply_PRECISION( vector_PRECISION y, vector_PRECISION x, config_PRECISION A, level_struct *l ) { +void coarse_LU_multiply_PRECISION( vector_PRECISION *y, vector_PRECISION *x, config_PRECISION A, level_struct *l ) { register int i, j, n2 = l->num_lattice_site_var; // y = Ax // multiplication with U for ( i=0; i<n2; i++ ) { - y[i] = A[i*(n2+1)]*x[i]; + y->vector_buffer[i] = A[i*(n2+1)]*x->vector_buffer[i]; for ( j=i+1; j<n2; j++ ) - y[i] += A[i*n2+j]*x[j]; + y->vector_buffer[i] += A[i*n2+j]*x->vector_buffer[j]; } // multiplication with L for ( i=n2-1; i>0; i-- ) for ( j=0; j<i; j++ ) - y[i] += A[i*n2+j]*y[j]; + y->vector_buffer[i] += A[i*n2+j]*y->vector_buffer[j]; } -void coarse_diag_ee_PRECISION( vector_PRECISION y, vector_PRECISION x, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { +void coarse_diag_ee_PRECISION( vector_PRECISION *y, vector_PRECISION *x, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { int start, end; compute_core_start_end_custom( 0, op->num_even_sites, &start, &end, l, threading, 1 ); // even sites -#ifndef OPTIMIZED_COARSE_SELF_COUPLING_PRECISION coarse_self_couplings_PRECISION( y, x, op, start, end, l ); -#else - coarse_self_couplings_PRECISION_vectorized( y, x, op, start, end, l ); -#endif } -void coarse_diag_oo_PRECISION( vector_PRECISION y, vector_PRECISION x, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { +void coarse_diag_oo_PRECISION( vector_PRECISION *y, vector_PRECISION *x,
operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { int start, end; -#ifndef OPTIMIZED_COARSE_SELF_COUPLING_PRECISION + + vector_PRECISION x_pt, y_pt; int num_site_var=l->num_lattice_site_var, oo_inv_size = SQUARE(num_site_var); #ifdef HAVE_TM1p1 @@ -317,72 +314,53 @@ void coarse_diag_oo_PRECISION( vector_PRECISION y, vector_PRECISION x, operator_ #else config_PRECISION sc = op->clover_oo_inv; #endif - + compute_core_start_end_custom( 0, op->num_odd_sites, &start, &end, l, threading, 1 ); - x += num_site_var*(op->num_even_sites+start); - y += num_site_var*(op->num_even_sites+start); + x_pt.vector_buffer = x->vector_buffer + num_site_var*(op->num_even_sites+start); + y_pt.vector_buffer = y->vector_buffer + num_site_var*(op->num_even_sites+start); sc += oo_inv_size*start; for ( int i=start; inum_even_sites, l->num_inner_lattice_sites, &start, &end, l, threading, 1 ); - coarse_self_couplings_PRECISION_vectorized( y, x, op, start, end, l ); -#endif } -void coarse_diag_PRECISION( vector_PRECISION y, vector_PRECISION x, operator_PRECISION_struct *op, level_struct *l ) { +void coarse_diag_PRECISION( vector_PRECISION *y, vector_PRECISION *x, operator_PRECISION_struct *op, level_struct *l ) { coarse_diag_ee_PRECISION( y, x, op, l, no_threading ); coarse_diag_oo_PRECISION( y, x, op, l, no_threading ); } -void coarse_diag_oo_inv_PRECISION( vector_PRECISION y, vector_PRECISION x, operator_PRECISION_struct *op, +void coarse_diag_oo_inv_PRECISION( vector_PRECISION *y, vector_PRECISION *x, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { int start, end; + vector_PRECISION x_pt, y_pt; compute_core_start_end_custom( 0, op->num_odd_sites, &start, &end, l, threading, 1 ); // odd sites int num_site_var = l->num_lattice_site_var, oo_inv_size = SQUARE(num_site_var); -#ifndef OPTIMIZED_COARSE_SELF_COUPLING_PRECISION #ifdef HAVE_TM1p1 config_PRECISION sc = (g.n_flavours==2) ? op->clover_doublet_oo_inv:op->clover_oo_inv; #else config_PRECISION sc = op->clover_oo_inv; -#endif -#else - int lda = SIMD_LENGTH_PRECISION*((num_site_var+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - oo_inv_size = 2*num_site_var*lda; -#ifdef HAVE_TM1p1 - OPERATOR_TYPE_PRECISION *sc = (g.n_flavours==2) ? 
op->clover_doublet_oo_inv_vectorized:op->clover_oo_inv_vectorized; -#else - OPERATOR_TYPE_PRECISION *sc = op->clover_oo_inv_vectorized; -#endif #endif - x += num_site_var*(op->num_even_sites+start); - y += num_site_var*(op->num_even_sites+start); + x_pt.vector_buffer = x->vector_buffer + num_site_var*(op->num_even_sites+start); + y_pt.vector_buffer = y->vector_buffer + num_site_var*(op->num_even_sites+start); sc += oo_inv_size*start; for ( int i=start; inum_odd_sites, &start, &end, l, threading, 1); -#ifndef OPTIMIZED_COARSE_SELF_COUPLING_PRECISION - int size = SQUARE(2*nv); for( int i=start; iclover_oo_inv+i*size, op, op->num_even_sites+i, l ); @@ -408,24 +384,6 @@ void coarse_oddeven_PRECISION_set_self_couplings( level_struct *l, struct Thread coarse_selfcoupling_LU_doublet_decomposition_PRECISION( op->clover_doublet_oo_inv+i*size_doublet, op, op->num_even_sites+i, l ); #endif - -#else - - int column_offset = SIMD_LENGTH_PRECISION*((2*nv+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - int size_v = 2*2*nv*column_offset; - for( int i=start; iclover_oo_inv_vectorized + i*size_v, - op->clover_vectorized + (op->num_even_sites+i)*size_v, column_offset ); - -#ifdef HAVE_TM1p1 - int column_doublet_offset = SIMD_LENGTH_PRECISION*((4*nv+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - int size_doublet_v = 2*4*nv*column_doublet_offset; - for( int i=start; iclover_doublet_oo_inv_vectorized + i*size_doublet_v, - op->clover_doublet_vectorized + (op->num_even_sites+i)*size_doublet_v, column_doublet_offset ); -#endif - -#endif } void coarse_oddeven_PRECISION_set_couplings( level_struct *l, struct Thread *threading ) { @@ -444,16 +402,15 @@ void coarse_oddeven_alloc_PRECISION( level_struct *l ) { operator_PRECISION_alloc( op, _ODDEVEN, l ); // buffers - MALLOC( op->buffer, complex_PRECISION*, 2 ); - op->buffer[0] = NULL; + MALLOC( op->buffer, vector_PRECISION, 2 ); + for (int k=0; k<2; k++ ){ + vector_PRECISION_init( &(op->buffer[k]) ); #ifdef HAVE_TM1p1 - MALLOC( op->buffer[0], complex_PRECISION, 4*l->vector_size ); - op->buffer[1] = op->buffer[0] + 2*l->vector_size; + vector_PRECISION_alloc( &(op->buffer[k]), _ORDINARY, 2, l, no_threading ); #else - MALLOC( op->buffer[0], complex_PRECISION, 2*l->vector_size ); - op->buffer[1] = op->buffer[0] + l->vector_size; + vector_PRECISION_alloc( &(op->buffer[k]), _ORDINARY, 1, l, no_threading ); #endif - + } for ( mu=0; mu<4; mu++ ) { le[mu] = l->local_lattice[mu]; N[mu] = le[mu]+1; @@ -479,23 +436,10 @@ void coarse_oddeven_alloc_PRECISION( level_struct *l ) { } } -#ifndef OPTIMIZED_COARSE_SELF_COUPLING_PRECISION - MALLOC( op->clover_oo_inv, complex_PRECISION, SQUARE(2*nv)*op->num_odd_sites ); #ifdef HAVE_TM1p1 MALLOC( op->clover_doublet_oo_inv, complex_PRECISION, SQUARE(4*nv)*op->num_odd_sites ); #endif - -#else - int column_offset = SIMD_LENGTH_PRECISION*((2*nv+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - MALLOC_HUGEPAGES( op->clover_oo_inv_vectorized, PRECISION, 2*2*nv*column_offset*op->num_odd_sites, 4*SIMD_LENGTH_PRECISION ); -#ifdef HAVE_TM1p1 - int column_doublet_offset = SIMD_LENGTH_PRECISION*((4*nv+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - MALLOC_HUGEPAGES( op->clover_doublet_oo_inv_vectorized, PRECISION, 2*4*nv*column_doublet_offset*op->num_odd_sites, 4*SIMD_LENGTH_PRECISION ); -#endif - -#endif - // define data layout eot = op->index_table; define_eot( eot, N, l ); @@ -601,39 +545,22 @@ void coarse_oddeven_setup_PRECISION( operator_PRECISION_struct *in, int reorder, void coarse_oddeven_free_PRECISION( level_struct *l ) { - int nv = 
l->num_parent_eig_vect, vs = l->vector_size; + int nv = l->num_parent_eig_vect; operator_PRECISION_struct *op = &(l->oe_op_PRECISION); operator_PRECISION_free( op, _ODDEVEN, l ); - coarse_operator_PRECISION_free_vectorized( op, l ); - -#ifndef OPTIMIZED_COARSE_SELF_COUPLING_PRECISION FREE( op->clover_oo_inv, complex_PRECISION, SQUARE(2*nv)*op->num_odd_sites ); #ifdef HAVE_TM1p1 FREE( op->clover_doublet_oo_inv, complex_PRECISION, SQUARE(4*nv)*op->num_odd_sites ); #endif - -#else - int column_offset = SIMD_LENGTH_PRECISION*((2*nv+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - FREE_HUGEPAGES( op->clover_oo_inv_vectorized, PRECISION, 2*2*nv*column_offset*op->num_odd_sites ); -#ifdef HAVE_TM1p1 - int column_doublet_offset = SIMD_LENGTH_PRECISION*((4*nv+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - FREE_HUGEPAGES( op->clover_doublet_oo_inv_vectorized, PRECISION, 2*4*nv*column_doublet_offset*op->num_odd_sites ); -#endif - -#endif - -#ifdef HAVE_TM1p1 - FREE( op->buffer[0], complex_PRECISION, 4*vs ); -#else - FREE( op->buffer[0], complex_PRECISION, 2*vs ); -#endif - FREE( op->buffer, complex_PRECISION*, 2 ); + for (int k=0; k<2; k++ ) + vector_PRECISION_free( &(op->buffer[k]), l, no_threading ); + FREE( op->buffer, vector_PRECISION, 2 ); } -void coarse_hopping_term_PRECISION( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, +void coarse_hopping_term_PRECISION( vector_PRECISION *out, vector_PRECISION *in, operator_PRECISION_struct *op, const int amount, level_struct *l, struct Thread *threading ) { START_NO_HYPERTHREADS(threading) @@ -646,6 +573,9 @@ void coarse_hopping_term_PRECISION( vector_PRECISION out, vector_PRECISION in, o vector_PRECISION in_pt, out_pt; config_PRECISION D_pt; + in_pt = *in; + out_pt = *out; + int core_start; int core_end; @@ -665,7 +595,7 @@ void coarse_hopping_term_PRECISION( vector_PRECISION out, vector_PRECISION in, o if ( op->c.comm ) { for ( mu=0; mu<4; mu++ ) { // communicate in -mu direction - ghost_sendrecv_PRECISION( in, mu, -1, &(op->c), minus_dir_param, l ); + ghost_sendrecv_PRECISION( in->vector_buffer, mu, -1, &(op->c), minus_dir_param, l ); } } END_MASTER(threading) @@ -681,49 +611,49 @@ void coarse_hopping_term_PRECISION( vector_PRECISION out, vector_PRECISION in, o // compute U_mu^dagger coupling for ( i=core_start; ineighbor_table[index]; + in_pt.vector_buffer = in->vector_buffer + num_site_var*op->neighbor_table[index]; D_pt = op->D + num_4link_var*op->neighbor_table[index] + 0*num_link_var; index++; - out_pt = out + num_site_var*op->neighbor_table[index+T]; - coarse_daggered_hopp_PRECISION( out_pt, in_pt, D_pt, l ); + out_pt.vector_buffer = out->vector_buffer + num_site_var*op->neighbor_table[index+T]; + coarse_daggered_hopp_PRECISION( &out_pt, &in_pt, D_pt, l ); } SYNC_CORES(threading) for ( i=core_start; ineighbor_table[index]; + in_pt.vector_buffer = in->vector_buffer + num_site_var*op->neighbor_table[index]; D_pt = op->D + num_4link_var*op->neighbor_table[index] + 1*num_link_var; index++; - out_pt = out + num_site_var*op->neighbor_table[index+Z]; - coarse_daggered_hopp_PRECISION( out_pt, in_pt, D_pt, l ); + out_pt.vector_buffer = out->vector_buffer + num_site_var*op->neighbor_table[index+Z]; + coarse_daggered_hopp_PRECISION( &out_pt, &in_pt, D_pt, l ); } SYNC_CORES(threading) for ( i=core_start; ineighbor_table[index]; + in_pt.vector_buffer = in->vector_buffer + num_site_var*op->neighbor_table[index]; D_pt = op->D + num_4link_var*op->neighbor_table[index] + 2*num_link_var; index++; - out_pt = out + 
num_site_var*op->neighbor_table[index+Y]; - coarse_daggered_hopp_PRECISION( out_pt, in_pt, D_pt, l ); + out_pt.vector_buffer = out->vector_buffer + num_site_var*op->neighbor_table[index+Y]; + coarse_daggered_hopp_PRECISION( &out_pt, &in_pt, D_pt, l ); } SYNC_CORES(threading) for ( i=core_start; ineighbor_table[index]; + in_pt.vector_buffer = in->vector_buffer + num_site_var*op->neighbor_table[index]; D_pt = op->D + num_4link_var*op->neighbor_table[index] + 3*num_link_var; index++; - out_pt = out + num_site_var*op->neighbor_table[index+X]; - coarse_daggered_hopp_PRECISION( out_pt, in_pt, D_pt, l ); + out_pt.vector_buffer = out->vector_buffer + num_site_var*op->neighbor_table[index+X]; + coarse_daggered_hopp_PRECISION( &out_pt, &in_pt, D_pt, l ); } START_LOCKED_MASTER(threading) if ( op->c.comm ) { for ( mu=0; mu<4; mu++ ) { // communicate in +mu direction - ghost_sendrecv_PRECISION( out, mu, +1, &(op->c), plus_dir_param, l ); + ghost_sendrecv_PRECISION( out->vector_buffer, mu, +1, &(op->c), plus_dir_param, l ); } for ( mu=0; mu<4; mu++ ) { // wait for -mu direction - ghost_wait_PRECISION( in, mu, -1, &(op->c), minus_dir_param, l ); + ghost_wait_PRECISION( in->vector_buffer, mu, -1, &(op->c), minus_dir_param, l ); } } END_LOCKED_MASTER(threading) @@ -738,30 +668,30 @@ void coarse_hopping_term_PRECISION( vector_PRECISION out, vector_PRECISION in, o // compute U_mu couplings for ( i=core_start; ineighbor_table[index]; + out_pt.vector_buffer = out->vector_buffer + num_site_var*op->neighbor_table[index]; D_pt = op->D + num_4link_var*op->neighbor_table[index]; index++; - in_pt = in + num_site_var*op->neighbor_table[index+T]; - coarse_hopp_PRECISION( out_pt, in_pt, D_pt, l ); + in_pt.vector_buffer = in->vector_buffer + num_site_var*op->neighbor_table[index+T]; + coarse_hopp_PRECISION( &out_pt, &in_pt, D_pt, l ); D_pt += num_link_var; - in_pt = in + num_site_var*op->neighbor_table[index+Z]; - coarse_hopp_PRECISION( out_pt, in_pt, D_pt, l ); + in_pt.vector_buffer = in->vector_buffer + num_site_var*op->neighbor_table[index+Z]; + coarse_hopp_PRECISION( &out_pt, &in_pt, D_pt, l ); D_pt += num_link_var; - in_pt = in + num_site_var*op->neighbor_table[index+Y]; - coarse_hopp_PRECISION( out_pt, in_pt, D_pt, l ); + in_pt.vector_buffer = in->vector_buffer + num_site_var*op->neighbor_table[index+Y]; + coarse_hopp_PRECISION( &out_pt, &in_pt, D_pt, l ); D_pt += num_link_var; - in_pt = in + num_site_var*op->neighbor_table[index+X]; - coarse_hopp_PRECISION( out_pt, in_pt, D_pt, l ); + in_pt.vector_buffer = in->vector_buffer + num_site_var*op->neighbor_table[index+X]; + coarse_hopp_PRECISION( &out_pt, &in_pt, D_pt, l ); } START_LOCKED_MASTER(threading) if ( op->c.comm ) { for ( mu=0; mu<4; mu++ ) { // wait for +mu direction - ghost_wait_PRECISION( out, mu, +1, &(op->c), plus_dir_param, l ); + ghost_wait_PRECISION( out->vector_buffer, mu, +1, &(op->c), plus_dir_param, l ); } } END_LOCKED_MASTER(threading) @@ -770,18 +700,9 @@ void coarse_hopping_term_PRECISION( vector_PRECISION out, vector_PRECISION in, o } -void coarse_n_hopping_term_PRECISION( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, +void coarse_n_hopping_term_PRECISION( vector_PRECISION *out, vector_PRECISION *in, operator_PRECISION_struct *op, const int amount, level_struct *l, struct Thread *threading ) { -#ifdef OPTIMIZED_COARSE_NEIGHBOR_COUPLING_PRECISION -#ifndef COMM_HIDING_COARSEOP - int sign = -1; - coarse_pn_hopping_term_PRECISION_vectorized( out, in, op, amount, l, sign, threading); -#else - 
coarse_n_hopping_term_PRECISION_vectorized( out, in, op, amount, l, threading ); -#endif - return; -#else START_NO_HYPERTHREADS(threading) int mu, i, index, num_site_var=l->num_lattice_site_var, @@ -791,6 +712,8 @@ void coarse_n_hopping_term_PRECISION( vector_PRECISION out, vector_PRECISION in, plus_dir_param=_FULL_SYSTEM, minus_dir_param=_FULL_SYSTEM; vector_PRECISION in_pt, out_pt; config_PRECISION D_pt; + in_pt = *in; + out_pt = *out; int core_start; int core_end; @@ -811,7 +734,7 @@ void coarse_n_hopping_term_PRECISION( vector_PRECISION out, vector_PRECISION in, if ( op->c.comm ) { for ( mu=0; mu<4; mu++ ) { // communicate in -mu direction - ghost_sendrecv_PRECISION( in, mu, -1, &(op->c), minus_dir_param, l ); + ghost_sendrecv_PRECISION( in->vector_buffer, mu, -1, &(op->c), minus_dir_param, l ); } } END_MASTER(threading) @@ -827,49 +750,49 @@ void coarse_n_hopping_term_PRECISION( vector_PRECISION out, vector_PRECISION in, // compute U_mu^dagger coupling for ( i=core_start; ineighbor_table[index]; + in_pt.vector_buffer = in->vector_buffer + num_site_var*op->neighbor_table[index]; D_pt = op->D + num_4link_var*op->neighbor_table[index] + 0*num_link_var; index++; - out_pt = out + num_site_var*op->neighbor_table[index+T]; - coarse_n_daggered_hopp_PRECISION( out_pt, in_pt, D_pt, l ); + out_pt.vector_buffer = out->vector_buffer + num_site_var*op->neighbor_table[index+T]; + coarse_n_daggered_hopp_PRECISION( &out_pt, &in_pt, D_pt, l ); } SYNC_CORES(threading) for ( i=core_start; ineighbor_table[index]; + in_pt.vector_buffer = in->vector_buffer + num_site_var*op->neighbor_table[index]; D_pt = op->D + num_4link_var*op->neighbor_table[index] + 1*num_link_var; index++; - out_pt = out + num_site_var*op->neighbor_table[index+Z]; - coarse_n_daggered_hopp_PRECISION( out_pt, in_pt, D_pt, l ); + out_pt.vector_buffer = out->vector_buffer + num_site_var*op->neighbor_table[index+Z]; + coarse_n_daggered_hopp_PRECISION( &out_pt, &in_pt, D_pt, l ); } SYNC_CORES(threading) for ( i=core_start; ineighbor_table[index]; + in_pt.vector_buffer = in->vector_buffer + num_site_var*op->neighbor_table[index]; D_pt = op->D + num_4link_var*op->neighbor_table[index] + 2*num_link_var; index++; - out_pt = out + num_site_var*op->neighbor_table[index+Y]; - coarse_n_daggered_hopp_PRECISION( out_pt, in_pt, D_pt, l ); + out_pt.vector_buffer = out->vector_buffer + num_site_var*op->neighbor_table[index+Y]; + coarse_n_daggered_hopp_PRECISION( &out_pt, &in_pt, D_pt, l ); } SYNC_CORES(threading) for ( i=core_start; ineighbor_table[index]; + in_pt.vector_buffer = in->vector_buffer + num_site_var*op->neighbor_table[index]; D_pt = op->D + num_4link_var*op->neighbor_table[index] + 3*num_link_var; index++; - out_pt = out + num_site_var*op->neighbor_table[index+X]; - coarse_n_daggered_hopp_PRECISION( out_pt, in_pt, D_pt, l ); + out_pt.vector_buffer = out->vector_buffer + num_site_var*op->neighbor_table[index+X]; + coarse_n_daggered_hopp_PRECISION( &out_pt, &in_pt, D_pt, l ); } START_LOCKED_MASTER(threading) if ( op->c.comm ) { for ( mu=0; mu<4; mu++ ) { // communicate in +mu direction - ghost_sendrecv_PRECISION( out, mu, +1, &(op->c), plus_dir_param, l ); + ghost_sendrecv_PRECISION( out->vector_buffer, mu, +1, &(op->c), plus_dir_param, l ); } for ( mu=0; mu<4; mu++ ) { // wait for -mu direction - ghost_wait_PRECISION( in, mu, -1, &(op->c), minus_dir_param, l ); + ghost_wait_PRECISION( in->vector_buffer, mu, -1, &(op->c), minus_dir_param, l ); } } END_LOCKED_MASTER(threading) @@ -884,444 +807,35 @@ void coarse_n_hopping_term_PRECISION( 
vector_PRECISION out, vector_PRECISION in, // compute U_mu couplings for ( i=core_start; ineighbor_table[index]; + out_pt.vector_buffer = out->vector_buffer + num_site_var*op->neighbor_table[index]; D_pt = op->D + num_4link_var*op->neighbor_table[index]; index++; - in_pt = in + num_site_var*op->neighbor_table[index+T]; - coarse_n_hopp_PRECISION( out_pt, in_pt, D_pt, l ); + in_pt.vector_buffer = in->vector_buffer + num_site_var*op->neighbor_table[index+T]; + coarse_n_hopp_PRECISION( &out_pt, &in_pt, D_pt, l ); D_pt += num_link_var; - in_pt = in + num_site_var*op->neighbor_table[index+Z]; - coarse_n_hopp_PRECISION( out_pt, in_pt, D_pt, l ); + in_pt.vector_buffer = in->vector_buffer + num_site_var*op->neighbor_table[index+Z]; + coarse_n_hopp_PRECISION( &out_pt, &in_pt, D_pt, l ); D_pt += num_link_var; - in_pt = in + num_site_var*op->neighbor_table[index+Y]; - coarse_n_hopp_PRECISION( out_pt, in_pt, D_pt, l ); + in_pt.vector_buffer = in->vector_buffer + num_site_var*op->neighbor_table[index+Y]; + coarse_n_hopp_PRECISION( &out_pt, &in_pt, D_pt, l ); D_pt += num_link_var; - in_pt = in + num_site_var*op->neighbor_table[index+X]; - coarse_n_hopp_PRECISION( out_pt, in_pt, D_pt, l ); + in_pt.vector_buffer = in->vector_buffer + num_site_var*op->neighbor_table[index+X]; + coarse_n_hopp_PRECISION( &out_pt, &in_pt, D_pt, l ); } START_LOCKED_MASTER(threading) if ( op->c.comm ) { for ( mu=0; mu<4; mu++ ) { // wait for +mu direction - ghost_wait_PRECISION( out, mu, +1, &(op->c), plus_dir_param, l ); + ghost_wait_PRECISION( out->vector_buffer, mu, +1, &(op->c), plus_dir_param, l ); } } END_LOCKED_MASTER(threading) END_NO_HYPERTHREADS(threading) -#endif -} - - -void coarse_hopping_term_PRECISION_vectorized( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, - const int amount, level_struct *l, struct Thread *threading ) { - -#ifdef OPTIMIZED_COARSE_NEIGHBOR_COUPLING_PRECISION - START_NO_HYPERTHREADS(threading) - - int mu, i, index, num_site_var=l->num_lattice_site_var, - start=0, num_lattice_sites=l->num_inner_lattice_sites, - plus_dir_param=_FULL_SYSTEM, minus_dir_param=_FULL_SYSTEM; - vector_PRECISION in_pt, out_pt; - - OPERATOR_TYPE_PRECISION *D_vectorized; - int column_offset = 2*SIMD_LENGTH_PRECISION*((l->num_parent_eig_vect+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - int vectorized_link_offset = 2*2*l->num_parent_eig_vect*column_offset; - - int core_start; - int core_end; - - // assumptions (1) self coupling has already been performed - // OR (2) "out" is initialized with zeros - set_boundary_PRECISION( out, 0, l, threading ); - - if ( amount == _EVEN_SITES ) { - minus_dir_param = _ODD_SITES; - plus_dir_param = _EVEN_SITES; - } else if ( amount == _ODD_SITES ) { - minus_dir_param = _EVEN_SITES; - plus_dir_param = _ODD_SITES; - } - - START_MASTER(threading) - if ( op->c.comm ) { - for ( mu=0; mu<4; mu++ ) { - // communicate in -mu direction - ghost_sendrecv_PRECISION( in, mu, -1, &(op->c), minus_dir_param, l ); - } - } - END_MASTER(threading) - SYNC_CORES(threading) - - if ( amount == _EVEN_SITES ) { - start = op->num_even_sites, num_lattice_sites = op->num_odd_sites; - } else if ( amount == _ODD_SITES ) { - start = 0; num_lattice_sites = op->num_even_sites; - } - compute_core_start_end_custom(start, start+num_lattice_sites, &core_start, &core_end, l, threading, 1); - - // compute U_mu^dagger coupling - for ( i=core_start; ineighbor_table[index]; - D_vectorized = op->D_transformed_vectorized + 4*vectorized_link_offset*op->neighbor_table[index] + 0*vectorized_link_offset; - 
index++; - out_pt = out + num_site_var*op->neighbor_table[index+T]; - coarse_hopp_PRECISION_vectorized( out_pt, in_pt, D_vectorized, l ); - } - SYNC_CORES(threading) - for ( i=core_start; ineighbor_table[index]; - D_vectorized = op->D_transformed_vectorized + 4*vectorized_link_offset*op->neighbor_table[index] + 1*vectorized_link_offset; - index++; - out_pt = out + num_site_var*op->neighbor_table[index+Z]; - coarse_hopp_PRECISION_vectorized( out_pt, in_pt, D_vectorized, l ); - } - SYNC_CORES(threading) - for ( i=core_start; ineighbor_table[index]; - D_vectorized = op->D_transformed_vectorized + 4*vectorized_link_offset*op->neighbor_table[index] + 2*vectorized_link_offset; - index++; - out_pt = out + num_site_var*op->neighbor_table[index+Y]; - coarse_hopp_PRECISION_vectorized( out_pt, in_pt, D_vectorized, l ); - } - SYNC_CORES(threading) - for ( i=core_start; ineighbor_table[index]; - D_vectorized = op->D_transformed_vectorized + 4*vectorized_link_offset*op->neighbor_table[index] + 3*vectorized_link_offset; - index++; - out_pt = out + num_site_var*op->neighbor_table[index+X]; - coarse_hopp_PRECISION_vectorized( out_pt, in_pt, D_vectorized, l ); - } - - START_LOCKED_MASTER(threading) - if ( op->c.comm ) { - for ( mu=0; mu<4; mu++ ) { - // communicate in +mu direction - ghost_sendrecv_PRECISION( out, mu, +1, &(op->c), plus_dir_param, l ); - } - for ( mu=0; mu<4; mu++ ) { - // wait for -mu direction - ghost_wait_PRECISION( in, mu, -1, &(op->c), minus_dir_param, l ); - } - } - END_LOCKED_MASTER(threading) - - if ( amount == _EVEN_SITES ) { - start = 0; num_lattice_sites = op->num_even_sites; - } else if ( amount == _ODD_SITES ) { - start = op->num_even_sites, num_lattice_sites = op->num_odd_sites; - } - compute_core_start_end_custom(start, start+num_lattice_sites, &core_start, &core_end, l, threading, 1); - - // compute U_mu couplings - for ( i=core_start; ineighbor_table[index]; - D_vectorized = op->D_vectorized + 4*vectorized_link_offset*op->neighbor_table[index]; - index++; - in_pt = in + num_site_var*op->neighbor_table[index+T]; - coarse_hopp_PRECISION_vectorized( out_pt, in_pt, D_vectorized, l ); - - D_vectorized += vectorized_link_offset; - in_pt = in + num_site_var*op->neighbor_table[index+Z]; - coarse_hopp_PRECISION_vectorized( out_pt, in_pt, D_vectorized, l ); - - in_pt = in + num_site_var*op->neighbor_table[index+Y]; - D_vectorized += vectorized_link_offset; - coarse_hopp_PRECISION_vectorized( out_pt, in_pt, D_vectorized, l ); - - in_pt = in + num_site_var*op->neighbor_table[index+X]; - D_vectorized += vectorized_link_offset; - coarse_hopp_PRECISION_vectorized( out_pt, in_pt, D_vectorized, l ); - } - - START_LOCKED_MASTER(threading) - if ( op->c.comm ) { - for ( mu=0; mu<4; mu++ ) { - // wait for +mu direction - ghost_wait_PRECISION( out, mu, +1, &(op->c), plus_dir_param, l ); - } - } - END_LOCKED_MASTER(threading) - - END_NO_HYPERTHREADS(threading) -#endif -} - - -void coarse_pn_hopping_term_PRECISION_vectorized( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, - const int amount, level_struct *l, int sign, struct Thread *threading ) { - -#ifdef OPTIMIZED_COARSE_NEIGHBOR_COUPLING_PRECISION - START_NO_HYPERTHREADS(threading) - - int mu, i, num_site_var=l->num_lattice_site_var, - start=0, num_lattice_sites=l->num_inner_lattice_sites, - plus_dir_param=_FULL_SYSTEM, minus_dir_param=_FULL_SYSTEM; - vector_PRECISION in_pt, out_pt; - - OPERATOR_TYPE_PRECISION *D_vectorized; - int column_offset = 
2*SIMD_LENGTH_PRECISION*((l->num_parent_eig_vect+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - int link_offset = 2*2*l->num_parent_eig_vect*column_offset; - int *neighbor_fw = op->neighbor_table; - int *neighbor_bw = op->backward_neighbor_table; - - int core_start; - int core_end; - - void (*coarse_hopp)(vector_PRECISION eta, vector_PRECISION phi, OPERATOR_TYPE_PRECISION *D, level_struct *l); - if(sign == +1) - coarse_hopp = coarse_hopp_PRECISION_vectorized; - else - coarse_hopp = coarse_n_hopp_PRECISION_vectorized; - - - if ( l->num_processes > 1 && op->c.comm ) { - set_boundary_PRECISION( out, 0, l, threading ); - - if ( amount == _EVEN_SITES ) { - minus_dir_param = _ODD_SITES; - plus_dir_param = _EVEN_SITES; - } else if ( amount == _ODD_SITES ) { - minus_dir_param = _EVEN_SITES; - plus_dir_param = _ODD_SITES; - } - - START_MASTER(threading) - for ( mu=0; mu<4; mu++ ) { - // send in -mu direction - ghost_sendrecv_PRECISION( in, mu, -1, &(op->c), minus_dir_param, l ); - } - END_MASTER(threading) - - if ( amount == _EVEN_SITES ) { - start = op->num_even_sites, num_lattice_sites = op->num_odd_sites; - } else if ( amount == _ODD_SITES ) { - start = 0; num_lattice_sites = op->num_even_sites; - } - compute_core_start_end_custom(start, start+num_lattice_sites, &core_start, &core_end, l, threading, 1); - - // prepare for sending to fw: compute hopping terms into forward boundary buffer - for ( i=core_start; inum_inner_lattice_sites) - continue; - out_pt = out + num_site_var*neighbor_fw[5*i+1+mu]; - in_pt = in + num_site_var*neighbor_fw[5*i]; - D_vectorized = op->D_transformed_vectorized + 4*link_offset*neighbor_fw[5*i] + mu*link_offset; - coarse_hopp( out_pt, in_pt, D_vectorized, l ); - } - } - START_LOCKED_MASTER(threading) - for ( mu=0; mu<4; mu++ ) { - // send in +mu direction - ghost_sendrecv_PRECISION( out, mu, +1, &(op->c), plus_dir_param, l ); - } - for ( mu=0; mu<4; mu++ ) { - // wait for -mu direction - ghost_wait_PRECISION( in, mu, -1, &(op->c), minus_dir_param, l ); - } - END_LOCKED_MASTER(threading) - } - else - SYNC_CORES(threading) - - - if ( amount == _EVEN_SITES ) { - start = 0; num_lattice_sites = op->num_even_sites; - } else if ( amount == _ODD_SITES ) { - start = op->num_even_sites, num_lattice_sites = op->num_odd_sites; - } - compute_core_start_end_custom(start, start+num_lattice_sites, &core_start, &core_end, l, threading, 1); - - // assumptions (1) self coupling has already been performed - // OR (2) "out" is initialized with zeros - for ( i=core_start; i= l->num_inner_lattice_sites) - continue; - D_vectorized = op->D_transformed_vectorized + 4*link_offset*neighbor_bw[5*i+1+mu] + mu*link_offset; - in_pt = in + num_site_var*neighbor_bw[5*i+1+mu]; - coarse_hopp( out_pt, in_pt, D_vectorized, l ); - } - - // compute U_mu couplings - for(int mu=0; mu<4; mu++) { - D_vectorized = op->D_vectorized + 4*link_offset*neighbor_fw[5*i] + mu*link_offset; - in_pt = in + num_site_var*neighbor_fw[5*i+1+mu]; - coarse_hopp( out_pt, in_pt, D_vectorized, l ); - } - } - - - // wait for terms from bw and add them - if ( l->num_processes > 1 && op->c.comm ) { - START_LOCKED_MASTER(threading) - for ( mu=0; mu<4; mu++ ) { - // wait for +mu direction - ghost_wait_PRECISION( out, mu, +1, &(op->c), plus_dir_param, l ); - } - END_LOCKED_MASTER(threading) - } - else - SYNC_CORES(threading) - - END_NO_HYPERTHREADS(threading) -#endif -} - - -void coarse_n_hopping_term_PRECISION_vectorized( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, - const int amount, level_struct *l, 
struct Thread *threading ) { - -#ifdef OPTIMIZED_COARSE_NEIGHBOR_COUPLING_PRECISION - START_NO_HYPERTHREADS(threading) - - int mu, i, index, num_site_var=l->num_lattice_site_var, - start=0, num_lattice_sites=l->num_inner_lattice_sites, - plus_dir_param=_FULL_SYSTEM, minus_dir_param=_FULL_SYSTEM; - vector_PRECISION in_pt, out_pt; - - OPERATOR_TYPE_PRECISION *D_vectorized; - int column_offset = 2*SIMD_LENGTH_PRECISION*((l->num_parent_eig_vect+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - int vectorized_link_offset = 2*2*l->num_parent_eig_vect*column_offset; - - int core_start; - int core_end; - - // assumptions (1) self coupling has already been performed - // OR (2) "out" is initialized with zeros - set_boundary_PRECISION( out, 0, l, threading ); - - if ( amount == _EVEN_SITES ) { - minus_dir_param = _ODD_SITES; - plus_dir_param = _EVEN_SITES; - } else if ( amount == _ODD_SITES ) { - minus_dir_param = _EVEN_SITES; - plus_dir_param = _ODD_SITES; - } - - START_MASTER(threading) - if ( op->c.comm ) { - for ( mu=0; mu<4; mu++ ) { - // communicate in -mu direction - ghost_sendrecv_PRECISION( in, mu, -1, &(op->c), minus_dir_param, l ); - } - } - END_MASTER(threading) - SYNC_CORES(threading) - - if ( amount == _EVEN_SITES ) { - start = op->num_even_sites, num_lattice_sites = op->num_odd_sites; - } else if ( amount == _ODD_SITES ) { - start = 0; num_lattice_sites = op->num_even_sites; - } - compute_core_start_end_custom(start, start+num_lattice_sites, &core_start, &core_end, l, threading, 1); - - // D is applied in an input-centric way - // this makes threading a bit ugly, is there a better way? - // compute U_mu^dagger coupling - for ( i=core_start; ineighbor_table[index]; - D_vectorized = op->D_transformed_vectorized + 4*vectorized_link_offset*op->neighbor_table[index] + 0*vectorized_link_offset; - index++; - out_pt = out + num_site_var*op->neighbor_table[index+T]; - coarse_n_hopp_PRECISION_vectorized( out_pt, in_pt, D_vectorized, l ); - } - SYNC_CORES(threading) - for ( i=core_start; ineighbor_table[index]; - D_vectorized = op->D_transformed_vectorized + 4*vectorized_link_offset*op->neighbor_table[index] + 1*vectorized_link_offset; - index++; - out_pt = out + num_site_var*op->neighbor_table[index+Z]; - coarse_n_hopp_PRECISION_vectorized( out_pt, in_pt, D_vectorized, l ); - } - SYNC_CORES(threading) - for ( i=core_start; ineighbor_table[index]; - D_vectorized = op->D_transformed_vectorized + 4*vectorized_link_offset*op->neighbor_table[index] + 2*vectorized_link_offset; - index++; - out_pt = out + num_site_var*op->neighbor_table[index+Y]; - coarse_n_hopp_PRECISION_vectorized( out_pt, in_pt, D_vectorized, l ); - } - SYNC_CORES(threading) - for ( i=core_start; ineighbor_table[index]; - D_vectorized = op->D_transformed_vectorized + 4*vectorized_link_offset*op->neighbor_table[index] + 3*vectorized_link_offset; - index++; - out_pt = out + num_site_var*op->neighbor_table[index+X]; - coarse_n_hopp_PRECISION_vectorized( out_pt, in_pt, D_vectorized, l ); - } - - START_LOCKED_MASTER(threading) - if ( op->c.comm ) { - for ( mu=0; mu<4; mu++ ) { - // communicate in +mu direction - ghost_sendrecv_PRECISION( out, mu, +1, &(op->c), plus_dir_param, l ); - } - for ( mu=0; mu<4; mu++ ) { - // wait for -mu direction - ghost_wait_PRECISION( in, mu, -1, &(op->c), minus_dir_param, l ); - } - } - END_LOCKED_MASTER(threading) - - if ( amount == _EVEN_SITES ) { - start = 0; num_lattice_sites = op->num_even_sites; - } else if ( amount == _ODD_SITES ) { - start = op->num_even_sites, num_lattice_sites = 
op->num_odd_sites; - } - compute_core_start_end_custom(start, start+num_lattice_sites, &core_start, &core_end, l, threading, 1); - - // compute U_mu couplings - for ( i=core_start; ineighbor_table[index]; - D_vectorized = op->D_vectorized + 4*vectorized_link_offset*op->neighbor_table[index]; - index++; - in_pt = in + num_site_var*op->neighbor_table[index+T]; - coarse_n_hopp_PRECISION_vectorized( out_pt, in_pt, D_vectorized, l ); - - D_vectorized += vectorized_link_offset; - in_pt = in + num_site_var*op->neighbor_table[index+Z]; - coarse_n_hopp_PRECISION_vectorized( out_pt, in_pt, D_vectorized, l ); - - D_vectorized += vectorized_link_offset; - in_pt = in + num_site_var*op->neighbor_table[index+Y]; - coarse_n_hopp_PRECISION_vectorized( out_pt, in_pt, D_vectorized, l ); - - D_vectorized += vectorized_link_offset; - in_pt = in + num_site_var*op->neighbor_table[index+X]; - coarse_n_hopp_PRECISION_vectorized( out_pt, in_pt, D_vectorized, l ); - } - - START_LOCKED_MASTER(threading) - if ( op->c.comm ) { - for ( mu=0; mu<4; mu++ ) { - // wait for +mu direction - ghost_wait_PRECISION( out, mu, +1, &(op->c), plus_dir_param, l ); - } - } - END_LOCKED_MASTER(threading) - - END_NO_HYPERTHREADS(threading) -#endif } @@ -1329,26 +843,26 @@ void coarse_solve_odd_even_PRECISION( gmres_PRECISION_struct *p, operator_PRECIS SYNC_CORES(threading) PROF_PRECISION_START( _SC, threading ); - coarse_diag_oo_inv_PRECISION( p->x, p->b, op, l, threading ); + coarse_diag_oo_inv_PRECISION( &p->x, &p->b, op, l, threading ); PROF_PRECISION_STOP( _SC, 0, threading ); PROF_PRECISION_START( _NC, threading ); - coarse_n_hopping_term_PRECISION( p->b, p->x, op, _EVEN_SITES, l, threading ); + coarse_n_hopping_term_PRECISION( &p->b, &p->x, op, _EVEN_SITES, l, threading ); PROF_PRECISION_STOP( _NC, 0, threading ); fgmres_PRECISION( p, l, threading ); // even to odd PROF_PRECISION_START( _NC, threading ); - coarse_n_hopping_term_PRECISION( p->b, p->x, op, _ODD_SITES, l, threading ); + coarse_n_hopping_term_PRECISION( &p->b, &p->x, op, _ODD_SITES, l, threading ); PROF_PRECISION_STOP( _NC, 1, threading ); PROF_PRECISION_START( _SC, threading ); - coarse_diag_oo_inv_PRECISION( p->x, p->b, op, l, threading ); + coarse_diag_oo_inv_PRECISION( &p->x, &p->b, op, l, threading ); PROF_PRECISION_STOP( _SC, 1, threading ); SYNC_CORES(threading) } -void coarse_apply_schur_complement_PRECISION( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { +void coarse_apply_schur_complement_PRECISION( vector_PRECISION *out, vector_PRECISION *in, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { // start and end indices for vector functions depending on thread int start; @@ -1364,16 +878,16 @@ void coarse_apply_schur_complement_PRECISION( vector_PRECISION out, vector_PRECI coarse_diag_ee_PRECISION( out, in, op, l, threading ); PROF_PRECISION_STOP( _SC, 0, threading ); SYNC_CORES(threading) - vector_PRECISION_define( tmp[0], 0, start, end, l ); + vector_PRECISION_define( &tmp[0], 0, start, end, l ); SYNC_CORES(threading) PROF_PRECISION_START( _NC, threading ); - coarse_hopping_term_PRECISION( tmp[0], in, op, _ODD_SITES, l, threading ); + coarse_hopping_term_PRECISION( &tmp[0], in, op, _ODD_SITES, l, threading ); PROF_PRECISION_STOP( _NC, 0, threading ); PROF_PRECISION_START( _SC, threading ); - coarse_diag_oo_inv_PRECISION( tmp[1], tmp[0], op, l, threading ); + coarse_diag_oo_inv_PRECISION( &tmp[1], &tmp[0], op, l, threading ); PROF_PRECISION_STOP( _SC, 1, 
threading ); PROF_PRECISION_START( _NC, threading ); - coarse_n_hopping_term_PRECISION( out, tmp[1], op, _EVEN_SITES, l, threading ); + coarse_n_hopping_term_PRECISION( out, &tmp[1], op, _EVEN_SITES, l, threading ); PROF_PRECISION_STOP( _NC, 1, threading ); } @@ -1387,47 +901,47 @@ void g5D_coarse_solve_odd_even_PRECISION( gmres_PRECISION_struct *p, operator_PR vector_PRECISION tmp = op->buffer[0]; SYNC_CORES(threading) - vector_PRECISION_define( tmp, 0, start_even, end_even, l ); + vector_PRECISION_define( &tmp, 0, start_even, end_even, l ); SYNC_CORES(threading) PROF_PRECISION_START( _SC, threading ); - coarse_gamma5_PRECISION( p->b, p->b, start_odd, end_odd, l ); + coarse_gamma5_PRECISION( &p->b, &p->b, start_odd, end_odd, l ); SYNC_CORES(threading) - coarse_diag_oo_inv_PRECISION( p->x, p->b, op, l, threading ); + coarse_diag_oo_inv_PRECISION( &p->x, &p->b, op, l, threading ); SYNC_CORES(threading) - coarse_gamma5_PRECISION( p->b, p->b, start_odd, end_odd, l ); + coarse_gamma5_PRECISION( &p->b, &p->b, start_odd, end_odd, l ); PROF_PRECISION_STOP( _SC, 0, threading ); PROF_PRECISION_START( _NC, threading ); - coarse_n_hopping_term_PRECISION( tmp, p->x, op, _EVEN_SITES, l, threading ); + coarse_n_hopping_term_PRECISION( &tmp, &p->x, op, _EVEN_SITES, l, threading ); PROF_PRECISION_STOP( _NC, 0, threading ); - coarse_gamma5_PRECISION( tmp, tmp, start_even, end_even, l ); + coarse_gamma5_PRECISION( &tmp, &tmp, start_even, end_even, l ); SYNC_CORES(threading) - vector_PRECISION_plus( p->b, p->b, tmp, start_even, end_even, l ); + vector_PRECISION_plus( &p->b, &p->b, &tmp, start_even, end_even, l ); fgmres_PRECISION( p, l, threading ); SYNC_CORES(threading) - coarse_gamma5_PRECISION( p->b, p->b, start_odd, end_odd, l ); + coarse_gamma5_PRECISION( &p->b, &p->b, start_odd, end_odd, l ); SYNC_CORES(threading) - coarse_diag_oo_inv_PRECISION( p->x, p->b, op, l, threading ); + coarse_diag_oo_inv_PRECISION( &p->x, &p->b, op, l, threading ); SYNC_CORES(threading) // even to odd PROF_PRECISION_START( _NC, threading ); - vector_PRECISION_define( tmp, 0, start_odd, end_odd, l ); + vector_PRECISION_define( &tmp, 0, start_odd, end_odd, l ); SYNC_CORES(threading) - coarse_n_hopping_term_PRECISION( tmp, p->x, op, _ODD_SITES, l, threading ); + coarse_n_hopping_term_PRECISION( &tmp, &p->x, op, _ODD_SITES, l, threading ); PROF_PRECISION_STOP( _NC, 1, threading ); SYNC_CORES(threading) PROF_PRECISION_START( _SC, threading ); - coarse_diag_oo_inv_PRECISION( p->b, tmp, op, l, threading ); - vector_PRECISION_plus( p->x, p->x, p->b, start_odd, end_odd, l ); + coarse_diag_oo_inv_PRECISION( &p->b, &tmp, op, l, threading ); + vector_PRECISION_plus( &p->x, &p->x, &p->b, start_odd, end_odd, l ); PROF_PRECISION_STOP( _SC, 1, threading ); SYNC_CORES(threading) } -void g5D_coarse_apply_schur_complement_PRECISION( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { +void g5D_coarse_apply_schur_complement_PRECISION( vector_PRECISION *out, vector_PRECISION *in, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { int start_even, end_even, start_odd, end_odd; compute_core_start_end_custom(0, op->num_even_sites*l->num_lattice_site_var, &start_even, &end_even, l, threading, l->num_lattice_site_var ); @@ -1440,16 +954,16 @@ void g5D_coarse_apply_schur_complement_PRECISION( vector_PRECISION out, vector_P coarse_diag_ee_PRECISION( out, in, op, l, threading ); PROF_PRECISION_STOP( _SC, 0, threading ); SYNC_CORES(threading) - 
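/* --- Editorial sketch (not part of the patch) ------------------------------
   As far as these hunks show, coarse_solve_odd_even_PRECISION performs the
   usual even/odd reduction: eliminate the odd sites with D_oo^{-1}, let
   fgmres solve the Schur system S x_e = b_e - D_eo D_oo^{-1} b_o with
   S = D_ee - D_eo D_oo^{-1} D_oe (the operator that
   coarse_apply_schur_complement_PRECISION applies), then reconstruct
   x_o = D_oo^{-1} ( b_o - D_oe x_e ).  A self-contained check of that
   algebra with 1x1 blocks (plain doubles, illustrative only): */
#include <stdio.h>
int main( void ) {
  /* 2x2 system in even/odd block form: [ Dee Deo ; Doe Doo ] [xe;xo] = [be;bo] */
  double Dee = 4.0, Deo = 1.0, Doe = 2.0, Doo = 5.0, be = 3.0, bo = 7.0;
  double S      = Dee - Deo*(1.0/Doo)*Doe;     /* Schur complement on even sites  */
  double be_red = be  - Deo*(1.0/Doo)*bo;      /* reduced right-hand side         */
  double xe     = be_red/S;                    /* the "fgmres" step, trivial here */
  double xo     = (1.0/Doo)*(bo - Doe*xe);     /* back substitution on odd sites  */
  printf( "even residual: %e\n", Dee*xe + Deo*xo - be );  /* both print ~0 */
  printf( "odd  residual: %e\n", Doe*xe + Doo*xo - bo );
  return 0;
}
/* -------------------------------------------------------------------------- */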
vector_PRECISION_define( tmp[0], 0, start_odd, end_odd, l ); + vector_PRECISION_define( &tmp[0], 0, start_odd, end_odd, l ); SYNC_CORES(threading) PROF_PRECISION_START( _NC, threading ); - coarse_hopping_term_PRECISION( tmp[0], in, op, _ODD_SITES, l, threading ); + coarse_hopping_term_PRECISION( &tmp[0], in, op, _ODD_SITES, l, threading ); PROF_PRECISION_STOP( _NC, 0, threading ); PROF_PRECISION_START( _SC, threading ); - coarse_diag_oo_inv_PRECISION( tmp[1], tmp[0], op, l, threading ); + coarse_diag_oo_inv_PRECISION( &tmp[1], &tmp[0], op, l, threading ); PROF_PRECISION_STOP( _SC, 1, threading ); PROF_PRECISION_START( _NC, threading ); - coarse_n_hopping_term_PRECISION( out, tmp[1], op, _EVEN_SITES, l, threading ); + coarse_n_hopping_term_PRECISION( out, &tmp[1], op, _EVEN_SITES, l, threading ); PROF_PRECISION_STOP( _NC, 1, threading ); SYNC_CORES(threading) coarse_gamma5_PRECISION( out, out, start_even, end_even, l ); @@ -1457,52 +971,55 @@ void g5D_coarse_apply_schur_complement_PRECISION( vector_PRECISION out, vector_P } -void coarse_odd_even_PRECISION_test( vector_PRECISION out, vector_PRECISION in, level_struct *l, struct Thread *threading ) { +void coarse_odd_even_PRECISION_test( vector_PRECISION *out, vector_PRECISION *in, level_struct *l, struct Thread *threading ) { if ( g.odd_even ) { - vector_PRECISION buf1 = NULL, buf2 = NULL; - - PUBLIC_MALLOC( buf1, complex_PRECISION, 2*l->vector_size ); - buf2 = buf1 + l->vector_size; + vector_PRECISION buf[2]; + for(int i=0; i<2; i++){ + vector_PRECISION_init( &buf[i] ); + vector_PRECISION_alloc( &buf[i], _ORDINARY, 1, l, threading ); + } + START_LOCKED_MASTER(threading) // transformation part - vector_PRECISION_copy( buf1, in, 0, l->inner_vector_size, l ); + vector_PRECISION_copy( &buf[0], in, 0, l->inner_vector_size, l ); // even to odd vector_PRECISION_define( out, 0, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l->inner_vector_size, l ); END_LOCKED_MASTER(threading) - coarse_hopping_term_PRECISION( out, buf1, &(l->oe_op_PRECISION), _ODD_SITES, l, threading ); - coarse_diag_oo_inv_PRECISION( buf2, out, &(l->oe_op_PRECISION), l, threading ); + coarse_hopping_term_PRECISION( out, &buf[0], &(l->oe_op_PRECISION), _ODD_SITES, l, threading ); + coarse_diag_oo_inv_PRECISION( &buf[1], out, &(l->oe_op_PRECISION), l, threading ); START_LOCKED_MASTER(threading) - vector_PRECISION_plus( buf1, buf1, buf2, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l->inner_vector_size, l ); + vector_PRECISION_plus( &buf[0], &buf[0], &buf[1], l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l->inner_vector_size, l ); END_LOCKED_MASTER(threading) // block diagonal part if ( g.method == 6 ) { - g5D_coarse_apply_schur_complement_PRECISION( out, buf1, &(l->oe_op_PRECISION), l, threading ); + g5D_coarse_apply_schur_complement_PRECISION( out, &buf[0], &(l->oe_op_PRECISION), l, threading ); } else { - coarse_apply_schur_complement_PRECISION( out, buf1, &(l->oe_op_PRECISION), l, threading ); + coarse_apply_schur_complement_PRECISION( out, &buf[0], &(l->oe_op_PRECISION), l, threading ); } - coarse_diag_oo_PRECISION( out, buf1, &(l->oe_op_PRECISION), l, threading ); + coarse_diag_oo_PRECISION( out, &buf[0], &(l->oe_op_PRECISION), l, threading ); // back transformation part - coarse_diag_oo_inv_PRECISION( buf2, out, &(l->oe_op_PRECISION), l, threading ); + coarse_diag_oo_inv_PRECISION( &buf[1], out, &(l->oe_op_PRECISION), l, threading ); if ( g.method == 6 ) { START_LOCKED_MASTER(threading) coarse_gamma5_PRECISION( out, out, 
l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l->inner_vector_size, l ); - vector_PRECISION_define( buf1, 0, 0, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l ); - coarse_hopping_term_PRECISION( buf1, buf2, &(l->oe_op_PRECISION), _EVEN_SITES, l, no_threading ); - coarse_gamma5_PRECISION( buf1, buf1, 0, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l ); - vector_PRECISION_plus( out, out, buf1, 0, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l ); + vector_PRECISION_define( &buf[0], 0, 0, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l ); + coarse_hopping_term_PRECISION( &buf[0], &buf[1], &(l->oe_op_PRECISION), _EVEN_SITES, l, no_threading ); + coarse_gamma5_PRECISION( &buf[0], &buf[0], 0, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l ); + vector_PRECISION_plus( out, out, &buf[0], 0, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l ); END_LOCKED_MASTER(threading) } else { - coarse_hopping_term_PRECISION( out, buf2, &(l->oe_op_PRECISION), _EVEN_SITES, l, threading ); + coarse_hopping_term_PRECISION( out, &buf[1], &(l->oe_op_PRECISION), _EVEN_SITES, l, threading ); } - PUBLIC_FREE( buf1, complex_PRECISION, 2*l->vector_size ); + for(int i=0; i<2; i++) + vector_PRECISION_free( &buf[i], l, threading ); } } diff --git a/src/coarse_oddeven_generic.h b/src/coarse_oddeven_generic.h index e1481be..ec33b23 100644 --- a/src/coarse_oddeven_generic.h +++ b/src/coarse_oddeven_generic.h @@ -34,30 +34,24 @@ void coarse_solve_odd_even_PRECISION( gmres_PRECISION_struct *p, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); - void coarse_apply_schur_complement_PRECISION( vector_PRECISION out, vector_PRECISION in, + void coarse_apply_schur_complement_PRECISION( vector_PRECISION *out, vector_PRECISION *in, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); void g5D_coarse_solve_odd_even_PRECISION( gmres_PRECISION_struct *p, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); - void g5D_coarse_apply_schur_complement_PRECISION( vector_PRECISION out, vector_PRECISION in, + void g5D_coarse_apply_schur_complement_PRECISION( vector_PRECISION *out, vector_PRECISION *in, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); - void coarse_hopping_term_PRECISION( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, + void coarse_hopping_term_PRECISION( vector_PRECISION *out, vector_PRECISION *in, operator_PRECISION_struct *op, const int amount, level_struct *l, struct Thread *threading ); - void coarse_n_hopping_term_PRECISION( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, - const int amount, level_struct *l, struct Thread *threading ); - void coarse_hopping_term_PRECISION_vectorized( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, - const int amount, level_struct *l, struct Thread *threading ); - void coarse_pn_hopping_term_PRECISION_vectorized( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, - const int amount, level_struct *l, int sign, struct Thread *threading ); - void coarse_n_hopping_term_PRECISION_vectorized( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, + void coarse_n_hopping_term_PRECISION( vector_PRECISION *out, vector_PRECISION *in, operator_PRECISION_struct *op, const int amount, level_struct *l, struct Thread *threading ); - void coarse_odd_even_PRECISION_test( vector_PRECISION c4, 
vector_PRECISION c1, + void coarse_odd_even_PRECISION_test( vector_PRECISION *c4, vector_PRECISION *c1, level_struct *l, struct Thread *threading ); - void coarse_diag_ee_PRECISION( vector_PRECISION y, vector_PRECISION x, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); + void coarse_diag_ee_PRECISION( vector_PRECISION *y, vector_PRECISION *x, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); - void coarse_diag_oo_PRECISION( vector_PRECISION y, vector_PRECISION x, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); + void coarse_diag_oo_PRECISION( vector_PRECISION *y, vector_PRECISION *x, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); #endif diff --git a/src/coarse_operator_generic.c b/src/coarse_operator_generic.c index 33338d3..641d9af 100644 --- a/src/coarse_operator_generic.c +++ b/src/coarse_operator_generic.c @@ -37,32 +37,6 @@ void coarse_operator_PRECISION_free( level_struct *l ) { operator_PRECISION_free( &(l->next_level->op_PRECISION), _ORDINARY, l->next_level ); - coarse_operator_PRECISION_free_vectorized( &(l->next_level->s_PRECISION.op), l->next_level ); -} - -void coarse_operator_PRECISION_free_vectorized( operator_PRECISION_struct *op, level_struct *l ) { - -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - if( op->D_vectorized != NULL ) { - int n2 = (l->depth>0 && l->level>0) ? (2*l->num_lattice_sites-l->num_inner_lattice_sites):l->num_inner_lattice_sites; - int column_offset = 2*SIMD_LENGTH_PRECISION*((l->num_parent_eig_vect+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - // 2 is for complex, 4 is for 4 directions - FREE_HUGEPAGES( op->D_vectorized, OPERATOR_TYPE_PRECISION, 2*4*2*l->num_parent_eig_vect*column_offset*n2 ); - FREE_HUGEPAGES( op->D_transformed_vectorized, OPERATOR_TYPE_PRECISION, 2*4*2*l->num_parent_eig_vect*column_offset*n2 ); - } -#endif - -#ifdef OPTIMIZED_SELF_COUPLING_PRECISION - if( op->clover_vectorized != NULL ) { - int n = l->num_inner_lattice_sites; - int column_offset = SIMD_LENGTH_PRECISION*((2*l->num_parent_eig_vect+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - FREE_HUGEPAGES( op->clover_vectorized, OPERATOR_TYPE_PRECISION, 2*2*l->num_parent_eig_vect*column_offset*n ); -#ifdef HAVE_TM1p1 - int column_doublet_offset = SIMD_LENGTH_PRECISION*((4*l->num_parent_eig_vect+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - FREE_HUGEPAGES( op->clover_doublet_vectorized, OPERATOR_TYPE_PRECISION, 2*4*l->num_parent_eig_vect*column_doublet_offset*n ); -#endif - } -#endif } void coarse_operator_PRECISION_setup( vector_PRECISION *V, level_struct *l ) { @@ -70,7 +44,8 @@ void coarse_operator_PRECISION_setup( vector_PRECISION *V, level_struct *l ) { double t0, t1; t0 = MPI_Wtime(); - vector_PRECISION buffer1 = l->vbuf_PRECISION[4], buffer2 = l->vbuf_PRECISION[5]; + vector_PRECISION buffer1, buffer2; + buffer1.vector_buffer = l->vbuf_PRECISION[4].vector_buffer; buffer2.vector_buffer = l->vbuf_PRECISION[5].vector_buffer; int mu, n = l->num_eig_vect, i, j, D_size = l->next_level->D_size, @@ -93,22 +68,22 @@ void coarse_operator_PRECISION_setup( vector_PRECISION *V, level_struct *l ) { for ( i=0; is_PRECISION.op.c), l ); + negative_sendrecv_PRECISION( &V[i], mu, &(l->s_PRECISION.op.c), l ); } // apply self coupling of block-and-2spin-restricted dirac operator for each aggregate - aggregate_self_coupling( buffer1, buffer2, V[i], &(l->s_PRECISION), l ); + aggregate_self_coupling( &buffer1, &buffer2, &V[i], &(l->s_PRECISION), l ); // calculate selfcoupling 
entries of the coarse grid operator - set_coarse_self_coupling_PRECISION( buffer1, buffer2, V, i, l ); + set_coarse_self_coupling_PRECISION( &buffer1, &buffer2, V, i, l ); //odd_proj - aggregate_block( buffer1, buffer2, V[i], l->s_PRECISION.op.odd_proj, l ); - set_block_diagonal_PRECISION( buffer1, buffer2, V, i, l->next_level->op_PRECISION.odd_proj, l ); + aggregate_block( &buffer1, &buffer2, &V[i], l->s_PRECISION.op.odd_proj, l ); + set_block_diagonal_PRECISION( &buffer1, &buffer2, V, i, l->next_level->op_PRECISION.odd_proj, l ); for ( mu=0; mu<4; mu++ ) { // finish updating ghostcells of V[i] negative_wait_PRECISION( mu, &(l->s_PRECISION.op.c), l ); // apply 2spin-restricted dirac operator for direction mu for all aggregates - aggregate_neighbor_coupling( buffer1, buffer2, V[i], mu, &(l->s_PRECISION), l ); - set_coarse_neighbor_coupling_PRECISION( buffer1, buffer2, V, mu, i, l ); + aggregate_neighbor_coupling( &buffer1, &buffer2, &V[i], mu, &(l->s_PRECISION), l ); + set_coarse_neighbor_coupling_PRECISION( &buffer1, &buffer2, V, mu, i, l ); } } @@ -129,7 +104,7 @@ void coarse_operator_PRECISION_setup_finalize( level_struct *l, struct Thread *t PRECISION mf = (g.mu_factor[l->depth]) ? g.mu_factor[l->next_level->depth]/g.mu_factor[l->depth]:0; if ( mf*l->s_PRECISION.op.mu + mf*l->s_PRECISION.op.mu_even_shift == 0 && mf*l->s_PRECISION.op.mu + mf*l->s_PRECISION.op.mu_odd_shift == 0 ) - vector_PRECISION_define( l->next_level->op_PRECISION.tm_term, _COMPLEX_double_ZERO, 0, block_size, l->next_level ); + buffer_PRECISION_define( l->next_level->op_PRECISION.tm_term, _COMPLEX_double_ZERO, 0, block_size, l->next_level ); else tm_term_PRECISION_setup( mf*l->s_PRECISION.op.mu, mf*l->s_PRECISION.op.mu_even_shift, mf*l->s_PRECISION.op.mu_odd_shift, &(l->next_level->op_PRECISION), @@ -140,7 +115,7 @@ void coarse_operator_PRECISION_setup_finalize( level_struct *l, struct Thread *t PRECISION ef = (g.epsbar_factor[l->depth]) ? 
g.epsbar_factor[l->next_level->depth]/g.epsbar_factor[l->depth]:0; if ( ef*l->s_PRECISION.op.epsbar == 0 && ef*l->s_PRECISION.op.epsbar_ig5_even_shift == 0 && ef*l->s_PRECISION.op.epsbar_ig5_odd_shift == 0 ) - vector_PRECISION_define( l->next_level->op_PRECISION.epsbar_term, _COMPLEX_double_ZERO, 0, block_size, l->next_level ); + buffer_PRECISION_define( l->next_level->op_PRECISION.epsbar_term, _COMPLEX_double_ZERO, 0, block_size, l->next_level ); else epsbar_term_PRECISION_setup( ef*l->s_PRECISION.op.epsbar, ef*l->s_PRECISION.op.epsbar_ig5_even_shift, ef*l->s_PRECISION.op.epsbar_ig5_odd_shift, &(l->next_level->op_PRECISION), @@ -149,7 +124,7 @@ void coarse_operator_PRECISION_setup_finalize( level_struct *l, struct Thread *t } -void set_block_diagonal_PRECISION( vector_PRECISION spin_0_1, vector_PRECISION spin_2_3, +void set_block_diagonal_PRECISION( vector_PRECISION *spin_0_1, vector_PRECISION *spin_2_3, vector_PRECISION *V, const int n, config_PRECISION block, level_struct *l ) { // U(x) = [ A 0 , A=A*, D=D* @@ -162,16 +137,16 @@ void set_block_diagonal_PRECISION( vector_PRECISION spin_0_1, vector_PRECISION s aggregate_size = l->num_inner_lattice_sites*l->num_parent_eig_vect*2/num_aggregates, offset = l->num_parent_eig_vect, block_site_size = (num_eig_vect*(num_eig_vect+1)); - vector_PRECISION spin_0_1_pt, spin_2_3_pt, interpolation_data; + buffer_PRECISION spin_0_1_pt, spin_2_3_pt, interpolation_data; config_PRECISION block_pt; for ( k=0; k<=n; k++ ) { k1 = (n*(n+1))/2+k; k2 = (n*(n+1))/2+k+block_site_size/2; for ( j=0; jvector_buffer + j*aggregate_size; + spin_2_3_pt = spin_2_3->vector_buffer + j*aggregate_size; + interpolation_data = V[k].vector_buffer + j*aggregate_size; block_pt = block + j*block_site_size; for ( i=0; iis_PRECISION.num_agg, @@ -194,7 +169,7 @@ void set_coarse_self_coupling_PRECISION( vector_PRECISION spin_0_1, vector_PRECI aggregate_size = l->num_inner_lattice_sites*l->num_parent_eig_vect*2/num_aggregates, offset = l->num_parent_eig_vect, clover_site_size = (num_eig_vect*(2*num_eig_vect+1)); - vector_PRECISION spin_0_1_pt, spin_2_3_pt, interpolation_data; + buffer_PRECISION spin_0_1_pt, spin_2_3_pt, interpolation_data; config_PRECISION clover_pt, clover = l->next_level->op_PRECISION.clover; // U(x) = [ A B , A=A*, D=D*, C = -B* @@ -205,9 +180,9 @@ void set_coarse_self_coupling_PRECISION( vector_PRECISION spin_0_1, vector_PRECI k1 = (n*(n+1))/2+k; k2 = (n*(n+1))/2+k+(num_eig_vect*(num_eig_vect+1))/2; for ( j=0; jvector_buffer + j*aggregate_size; + spin_2_3_pt = spin_2_3->vector_buffer + j*aggregate_size; + interpolation_data = V[k].vector_buffer + j*aggregate_size; clover_pt = clover + j*clover_site_size; for ( i=0; ivector_buffer + j*aggregate_size; + spin_2_3_pt = spin_2_3->vector_buffer + j*aggregate_size; + interpolation_data = V[k].vector_buffer + j*aggregate_size; clover_pt = clover + j*clover_site_size; for ( i=0; iis_PRECISION.num_agg, @@ -250,7 +225,7 @@ void set_coarse_neighbor_coupling_PRECISION( vector_PRECISION spin_0_1, vector_P D_link_size = num_eig_vect*num_eig_vect*4, *index_dir = l->is_PRECISION.agg_boundary_index[mu], aggregate_boundary_sites = l->is_PRECISION.agg_boundary_length[mu]/num_aggregates; - vector_PRECISION spin_0_1_pt, spin_2_3_pt, interpolation_data; + buffer_PRECISION spin_0_1_pt, spin_2_3_pt, interpolation_data; config_PRECISION D_pt, D = l->next_level->op_PRECISION.D; // U_mu(x) = [ A B , U_-mu(x+muhat) = [ A* -C* @@ -264,8 +239,8 @@ void set_coarse_neighbor_coupling_PRECISION( vector_PRECISION spin_0_1, vector_P D_pt = 
D+(j*4+mu)*D_link_size; for ( i=0; ivector_buffer + nlsv*index_dir[i1]; + interpolation_data = V[k].vector_buffer + nlsv*index_dir[i1]; i1++; // A for ( m=0; mvector_buffer + nlsv*index_dir[i1]; + interpolation_data = V[k].vector_buffer + nlsv*index_dir[i1]; i1++; // B for ( m=0; mnum_block_sites, *length = s->dir_length, **index = s->index, *ind, *neighbor = s->op.neighbor_table, m = l->num_lattice_site_var, num_eig_vect = l->num_parent_eig_vect; - vector_PRECISION lphi = phi+start, leta = eta+start; - + vector_PRECISION lphi, leta; + lphi.vector_buffer = phi->vector_buffer+start; leta.vector_buffer = eta->vector_buffer+start; + vector_PRECISION leta1=leta, leta2=leta, lphi1=lphi, lphi2=lphi; + // site-wise self coupling -#ifndef OPTIMIZED_COARSE_SELF_COUPLING_PRECISION coarse_self_couplings_PRECISION( eta, phi, &(s->op), (start/m), (start/m)+n, l); -#else - coarse_self_couplings_PRECISION_vectorized( eta, phi, &(s->op), (start/m), (start/m)+n, l ); -#endif // inner block couplings -#ifndef OPTIMIZED_COARSE_NEIGHBOR_COUPLING_PRECISION int hopp_size = 4 * SQUARE( num_eig_vect*2 ); config_PRECISION D_pt, D = s->op.D + (start/m)*hopp_size; @@ -320,34 +292,20 @@ void coarse_block_operator_PRECISION( vector_PRECISION eta, vector_PRECISION phi for ( int i=0; iop.D_vectorized + - (start/m)*4*vectorized_link_offset + mu*vectorized_link_offset; - OPERATOR_TYPE_PRECISION *Dminus = s->op.D_transformed_vectorized + - (start/m)*4*vectorized_link_offset + mu*vectorized_link_offset; - ind = index[mu]; // mu direction - for ( int i=0; iop.neighbor_table, @@ -363,16 +321,16 @@ void coarse_aggregate_self_couplings_PRECISION( vector_PRECISION eta1, vector_PR length = l->is_PRECISION.agg_length[mu]; index_dir = l->is_PRECISION.agg_index[mu]; for ( i=0; ivector_buffer + n*index2; eta1_pt.vector_buffer = eta1->vector_buffer + n*index1; eta2_pt.vector_buffer = eta2->vector_buffer + n*index1; + coarse_spinwise_n_hopp_PRECISION( &eta1_pt, &eta2_pt, &phi_pt, D_pt, l ); + phi_pt.vector_buffer = phi->vector_buffer + n*index1; eta1_pt.vector_buffer = eta1->vector_buffer + n*index2; eta2_pt.vector_buffer = eta2->vector_buffer + n*index2; + coarse_spinwise_n_daggered_hopp_PRECISION( &eta1_pt, &eta2_pt, &phi_pt, D_pt, l ); } } } -void coarse_aggregate_neighbor_couplings_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, vector_PRECISION phi, +void coarse_aggregate_neighbor_couplings_PRECISION( vector_PRECISION *eta1, vector_PRECISION *eta2, vector_PRECISION *phi, const int mu, schwarz_PRECISION_struct *s, level_struct *l ) { int i, index1, index2, length = l->is_PRECISION.agg_boundary_length[mu], @@ -390,12 +348,12 @@ void coarse_aggregate_neighbor_couplings_PRECISION( vector_PRECISION eta1, vecto index1 = index_dir[i]; index2 = neighbor[i]; D_pt = D + Dss*index1 + Dls*mu; - phi_pt = phi + n*index2; eta1_pt = eta1 + n*index1; eta2_pt = eta2 + n*index1; - coarse_spinwise_hopp_PRECISION( eta1_pt, eta2_pt, phi_pt, D_pt, l ); + phi_pt.vector_buffer = phi->vector_buffer + n*index2; eta1_pt.vector_buffer = eta1->vector_buffer + n*index1; eta2_pt.vector_buffer = eta2->vector_buffer + n*index1; + coarse_spinwise_hopp_PRECISION( &eta1_pt, &eta2_pt, &phi_pt, D_pt, l ); } } -void coarse_self_couplings_PRECISION( vector_PRECISION eta, vector_PRECISION phi, +void coarse_self_couplings_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, operator_PRECISION_struct *op, int start, int end, level_struct *l ) { int num_eig_vect = l->num_parent_eig_vect, @@ -419,39 +377,40 @@ void coarse_self_couplings_PRECISION( 
vector_PRECISION eta, vector_PRECISION phi } -void coarse_aggregate_block_diagonal_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, vector_PRECISION phi, +void coarse_aggregate_block_diagonal_PRECISION( vector_PRECISION *eta1, vector_PRECISION *eta2, vector_PRECISION *phi, config_PRECISION block, level_struct *l ) { int length = l->inner_vector_size, num_eig_vect = l->num_parent_eig_vect, block_step_size = (num_eig_vect * (num_eig_vect+1))/2; config_PRECISION block_pt = block; - vector_PRECISION phi_pt=phi, eta1_pt=eta1, eta2_pt=eta2, phi_end_pt=phi+length; + vector_PRECISION phi_pt=*phi, eta1_pt=*eta1, eta2_pt=*eta2, phi_end_pt; + phi_end_pt.vector_buffer=phi->vector_buffer+length; // U(x) = [ A 0 , A=A*, D=D* // 0 D ] // storage order: upper triangle of A, upper triangle of D, columnwise // diagonal coupling - while ( phi_pt < phi_end_pt ) { + while ( phi_pt.vector_buffer< phi_end_pt.vector_buffer ) { // A - mvp_PRECISION( eta1_pt, block_pt, phi_pt, num_eig_vect ); - vector_PRECISION_define( eta2_pt, _COMPLEX_PRECISION_ZERO, 0, num_eig_vect, l ); - block_pt += block_step_size; eta1_pt += num_eig_vect; eta2_pt += num_eig_vect; phi_pt += num_eig_vect; + mvp_PRECISION( eta1_pt.vector_buffer, block_pt, phi_pt.vector_buffer, num_eig_vect ); + vector_PRECISION_define( &eta2_pt, _COMPLEX_PRECISION_ZERO, 0, num_eig_vect, l ); + block_pt += block_step_size; eta1_pt.vector_buffer += num_eig_vect; eta2_pt.vector_buffer += num_eig_vect; phi_pt.vector_buffer += num_eig_vect; // D - vector_PRECISION_define( eta1_pt, _COMPLEX_PRECISION_ZERO, 0, num_eig_vect, l ); - mvp_PRECISION( eta2_pt, block_pt, phi_pt, num_eig_vect ); - block_pt += block_step_size; eta1_pt += num_eig_vect; eta2_pt += num_eig_vect; phi_pt += num_eig_vect; + vector_PRECISION_define( &eta1_pt, _COMPLEX_PRECISION_ZERO, 0, num_eig_vect, l ); + mvp_PRECISION( eta2_pt.vector_buffer, block_pt, phi_pt.vector_buffer, num_eig_vect ); + block_pt += block_step_size; eta1_pt.vector_buffer += num_eig_vect; eta2_pt.vector_buffer += num_eig_vect; phi_pt.vector_buffer += num_eig_vect; } } -void coarse_spinwise_self_couplings_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, vector_PRECISION phi, +void coarse_spinwise_self_couplings_PRECISION( vector_PRECISION *eta1, vector_PRECISION *eta2, vector_PRECISION *phi, config_PRECISION clover, int length, level_struct *l ) { int num_eig_vect = l->num_parent_eig_vect, clover_step_size1 = (num_eig_vect * (num_eig_vect+1))/2, clover_step_size2 = SQUARE(num_eig_vect); config_PRECISION clover_pt = clover; - vector_PRECISION phi_pt=phi, eta1_pt=eta1, eta2_pt=eta2+num_eig_vect, phi_end_pt=phi+length; + buffer_PRECISION phi_pt=phi->vector_buffer, eta1_pt=eta1->vector_buffer, eta2_pt=eta2->vector_buffer+num_eig_vect, phi_end_pt=phi->vector_buffer+length; // U(x) = [ A B , A=A*, D=D*, C = -B* // C D ] // storage order: upper triangle of A, upper triangle of D, B, columnwise @@ -483,181 +442,77 @@ void coarse_operator_PRECISION_set_couplings( operator_PRECISION_struct *op, lev void coarse_operator_PRECISION_set_neighbor_couplings( operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { -#ifdef OPTIMIZED_COARSE_NEIGHBOR_COUPLING_PRECISION - int nc_size = SQUARE(l->num_parent_eig_vect*2); - int n1, n2; - int column_offset = 2*SIMD_LENGTH_PRECISION*((l->num_parent_eig_vect+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - int offset_v = 4*l->num_parent_eig_vect*column_offset; - - if ( l->depth > 0 && l->level>0 ) { - n1 = l->num_lattice_sites; - n2 = 
2*l->num_lattice_sites-l->num_inner_lattice_sites; - } else { - n1 = l->num_inner_lattice_sites; - n2 = l->num_inner_lattice_sites; - } - int start, end; - compute_core_start_end_custom(0, n1, &start, &end, l, threading, 1); - int n_per_core = end-start; - START_LOCKED_MASTER(threading) - if( op->D_vectorized == NULL ) { - // 2 is for complex, 4 is for 4 directions - MALLOC_HUGEPAGES( op->D_vectorized, OPERATOR_TYPE_PRECISION, 4*offset_v*n2, 64 ); - MALLOC_HUGEPAGES( op->D_transformed_vectorized, OPERATOR_TYPE_PRECISION, 4*offset_v*n2, 64 ); - } - END_LOCKED_MASTER(threading) - - copy_coarse_operator_to_vectorized_layout_PRECISION( - op->D + 4*start*nc_size, - op->D_vectorized + 4*start*offset_v, - n_per_core, l->num_parent_eig_vect); - copy_coarse_operator_to_transformed_vectorized_layout_PRECISION( - op->D + 4*start*nc_size, - op->D_transformed_vectorized + 4*start*offset_v, - n_per_core, l->num_parent_eig_vect); - // vectorize negative boundary - if ( n2>n1 ) { - compute_core_start_end_custom(n1, n2, &start, &end, l, threading, 1); - n_per_core = end-start; - copy_coarse_operator_to_vectorized_layout_PRECISION( - op->D + 4*start*nc_size, - op->D_vectorized + 4*start*offset_v, - n_per_core, l->num_parent_eig_vect); - copy_coarse_operator_to_transformed_vectorized_layout_PRECISION( - op->D + 4*start*nc_size, - op->D_transformed_vectorized + 4*start*offset_v, - n_per_core, l->num_parent_eig_vect); - } - SYNC_CORES(threading) -#endif - } void coarse_operator_PRECISION_set_self_couplings( operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { -#ifdef OPTIMIZED_COARSE_SELF_COUPLING_PRECISION - int n = l->num_inner_lattice_sites, nv = l->num_parent_eig_vect; - int sc_size = (nv)*(nv*2+1); - int start, end; - compute_core_start_end_custom(0, n, &start, &end, l, threading, 1); - int n_per_core = end-start; - - int column_offset = SIMD_LENGTH_PRECISION*((2*nv+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - int offset_v = 2*2*nv*column_offset; - if( op->clover_vectorized == NULL ) { - START_LOCKED_MASTER(threading) - MALLOC_HUGEPAGES( op->clover_vectorized, OPERATOR_TYPE_PRECISION, offset_v*n, 64 ); - END_LOCKED_MASTER(threading) - } - copy_coarse_operator_clover_to_vectorized_layout_PRECISION( - op->clover + start*sc_size, - op->clover_vectorized + start*offset_v, - n_per_core, nv); -#ifdef HAVE_TM - int tm_size = (nv)*(nv+1); - if ( op->mu + op->mu_odd_shift != 0.0 || op->mu + op->mu_even_shift != 0.0 ) - add_tm_term_to_vectorized_layout_PRECISION( - op->tm_term + start*tm_size, - op->clover_vectorized + start*offset_v, - n_per_core, nv); -#endif - -#ifdef HAVE_TM1p1 - int column_doublet_offset = SIMD_LENGTH_PRECISION*((4*nv+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - int offset_doublet_v = 2*4*nv*column_doublet_offset; - int eps_size = (nv)*(nv+1); - if( op->clover_doublet_vectorized == NULL ) { - START_LOCKED_MASTER(threading) - MALLOC_HUGEPAGES( op->clover_doublet_vectorized, OPERATOR_TYPE_PRECISION, offset_doublet_v*n, 64 ); - END_LOCKED_MASTER(threading) - } - copy_coarse_operator_clover_to_doublet_vectorized_layout_PRECISION( - op->clover + start*sc_size, - op->clover_doublet_vectorized + start*offset_doublet_v, - n_per_core, nv); - if ( op->epsbar != 0 || op->epsbar_ig5_odd_shift != 0 || op->epsbar_ig5_odd_shift != 0 ) - add_epsbar_term_to_doublet_vectorized_layout_PRECISION( - op->epsbar_term + start*eps_size, - op->clover_doublet_vectorized + start*offset_doublet_v, - n_per_core, nv); -#ifdef HAVE_TM - if ( op->mu + op->mu_odd_shift != 0.0 || op->mu + 
op->mu_even_shift != 0.0 ) - add_tm_term_to_doublet_vectorized_layout_PRECISION( - op->tm_term + start*tm_size, - op->clover_doublet_vectorized + start*offset_doublet_v, - n_per_core, nv); -#endif -#endif - SYNC_CORES(threading) -#endif } -void coarse_gamma5_PRECISION( vector_PRECISION eta, vector_PRECISION phi, int start, int end, level_struct *l ) { +void coarse_gamma5_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, int start, int end, level_struct *l ) { int j, k=l->num_lattice_site_var/2; - vector_PRECISION eta_end; + buffer_PRECISION eta_end, eta_pt, phi_pt; + eta_end = eta->vector_buffer + end; + phi_pt = phi->vector_buffer + start; + eta_pt = eta->vector_buffer + start; - eta_end = eta+end; - phi += start; - eta += start; - - if ( eta != phi ) { - while ( eta < eta_end ) { + if ( eta_pt != phi_pt ) { + while ( eta_pt < eta_end ) { for ( j=0; jnum_lattice_site_var/4; - vector_PRECISION eta_end; + buffer_PRECISION eta_end, phi_pt, eta_pt; - eta_end = eta+end; - phi += start; - eta += start; + eta_end = eta->vector_buffer + end; + phi_pt = phi->vector_buffer + start; + eta_pt = eta->vector_buffer + start; - ASSERT( eta != phi ); - while ( eta < eta_end ) { - phi += k; + ASSERT( eta_pt != phi_pt ); + while ( eta_pt < eta_end ) { + phi_pt += k; for ( j=0; jnum_inner_lattice_sites, &start, &end, l, threading, 1); -#ifndef OPTIMIZED_COARSE_SELF_COUPLING_PRECISION coarse_self_couplings_PRECISION( eta, phi, op, start, end, l); -#else - coarse_self_couplings_PRECISION_vectorized( eta, phi, op, start, end, l ); -#endif PROF_PRECISION_STOP( _SC, 1, threading ); PROF_PRECISION_START( _NC, threading ); -#ifndef OPTIMIZED_COARSE_NEIGHBOR_COUPLING_PRECISION coarse_hopping_term_PRECISION( eta, phi, op, _FULL_SYSTEM, l, threading ); -#else - coarse_hopping_term_PRECISION_vectorized( eta, phi, op, _FULL_SYSTEM, l, threading ); -#endif PROF_PRECISION_STOP( _NC, 1, threading ); } -void g5D_apply_coarse_operator_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operator_PRECISION_struct *op, +void g5D_apply_coarse_operator_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { int start, end; compute_core_start_end_custom(0, l->inner_vector_size, &start, &end, l, threading, l->num_lattice_site_var ); @@ -704,11 +551,11 @@ void g5D_apply_coarse_operator_PRECISION( vector_PRECISION eta, vector_PRECISION } -void apply_coarse_operator_dagger_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operator_PRECISION_struct *op, +void apply_coarse_operator_dagger_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { - coarse_gamma5_PRECISION( l->vbuf_PRECISION[3], phi, threading->start_index[l->depth], threading->end_index[l->depth], l ); - apply_coarse_operator_PRECISION( eta, l->vbuf_PRECISION[3], op, l, threading ); + coarse_gamma5_PRECISION( &(l->vbuf_PRECISION[3]), phi, threading->start_index[l->depth], threading->end_index[l->depth], l ); + apply_coarse_operator_PRECISION( eta, &(l->vbuf_PRECISION[3]), op, l, threading ); coarse_gamma5_PRECISION( eta, eta, threading->start_index[l->depth], threading->end_index[l->depth], l ); } @@ -716,68 +563,61 @@ void apply_coarse_operator_dagger_PRECISION( vector_PRECISION eta, vector_PRECIS void coarse_operator_PRECISION_test_routine( level_struct *l, struct Thread *threading ) { if ( !l->idle ) { - int vs = l->vector_size, ivs = l->inner_vector_size, - cvs = l->next_level->vector_size, civs = 
l->next_level->inner_vector_size; + int ivs = l->inner_vector_size, civs = l->next_level->inner_vector_size; PRECISION diff = 0; - vector_PRECISION vp1=NULL, vp2, vp3, vp4, vc1=NULL, vc2, vc3; + vector_PRECISION vp[4], vc[3]; + + for(int i=0; i<4; i++){ + vector_PRECISION_init( &vp[i] ); + vector_PRECISION_alloc( &vp[i], _ORDINARY, 1, l, threading ); + } - PUBLIC_MALLOC( vp1, complex_PRECISION, 4*vs ); - PUBLIC_MALLOC( vc1, complex_PRECISION, 3*cvs ); + for(int i=0; i<3; i++){ + vector_PRECISION_init( &vc[i] ); + vector_PRECISION_alloc( &vc[i], _ORDINARY, 1, l->next_level, threading ); + } SYNC_MASTER_TO_ALL(threading) - vp2 = vp1 + vs; vp3 = vp2 + vs; vp4 = vp3 + vs; vc2 = vc1 + cvs; vc3 = vc2 + cvs; - START_LOCKED_MASTER(threading) #ifdef HAVE_TM1p1 if(g.n_flavours == 1) #endif { -#ifdef INTERPOLATION_OPERATOR_LAYOUT_OPTIMIZED_PRECISION - double norm = 0.0; - double dot = 0.0; - float *op = (float *)l->is_PRECISION.operator; - float *op2 = (float *)(l->is_PRECISION.operator+0*SIMD_LENGTH_PRECISION*l->vector_size)+1; - for ( int i=0; iinner_vector_size; i++ ) - norm += (op[2*i*SIMD_LENGTH_PRECISION+0] + I*op[2*i*SIMD_LENGTH_PRECISION+SIMD_LENGTH_PRECISION])*conj(op[2*i*SIMD_LENGTH_PRECISION+0] + I*op[2*i*SIMD_LENGTH_PRECISION+SIMD_LENGTH_PRECISION]); - for ( int i=0; iinner_vector_size; i++ ) - dot += (op[2*i*SIMD_LENGTH_PRECISION+0] + I*op[2*i*SIMD_LENGTH_PRECISION+SIMD_LENGTH_PRECISION])*conj(op2[2*i*SIMD_LENGTH_PRECISION+0] + I*op2[2*i*SIMD_LENGTH_PRECISION+SIMD_LENGTH_PRECISION]); - diff = dot/norm; -#else - diff = global_inner_product_PRECISION( l->is_PRECISION.interpolation[0], l->is_PRECISION.interpolation[1], 0, ivs, l, no_threading ) - / global_norm_PRECISION( l->is_PRECISION.interpolation[0], 0, ivs, l, no_threading ); -#endif + diff = global_inner_product_PRECISION( &(l->is_PRECISION.interpolation[0]), &(l->is_PRECISION.interpolation[1]), 0, ivs, l, no_threading ) + / global_norm_PRECISION( &(l->is_PRECISION.interpolation[0]), 0, ivs, l, no_threading ); + test0_PRECISION("depth: %d, correctness of block_gram_schmidt: %le\n", l->depth, cabs(diff) ); } if ( !l->next_level->idle ) - vector_PRECISION_define_random( vc1, 0, civs, l->next_level ); - vector_PRECISION_distribute( vc2, vc1, l->next_level ); - vector_PRECISION_gather( vc3, vc2, l->next_level ); + vector_PRECISION_define_random( &vc[0], 0, civs, l->next_level ); + vector_PRECISION_distribute( &vc[1], &vc[0], l->next_level ); + vector_PRECISION_gather( &vc[2], &vc[1], l->next_level ); if ( !l->next_level->idle ) { - vector_PRECISION_minus( vc2, vc1, vc3, 0, civs, l->next_level ); - diff = global_norm_PRECISION( vc2, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( vc1, 0, civs, l->next_level, no_threading ); + vector_PRECISION_minus( &vc[1], &vc[0], &vc[2], 0, civs, l->next_level ); + diff = global_norm_PRECISION( &vc[1], 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( &vc[0], 0, civs, l->next_level, no_threading ); } test0_PRECISION("depth: %d, correctness of gather( distribute( phi_c ) ) : %le\n", l->depth, diff ); if ( !l->next_level->idle ) - vector_PRECISION_define_random( vc1, 0, civs, l->next_level ); - interpolate3_PRECISION( vp1, vc1, l, no_threading ); - restrict_PRECISION( vc2, vp1, l, no_threading ); + vector_PRECISION_define_random( &vc[0], 0, civs, l->next_level ); + interpolate3_PRECISION( &vp[0], &vc[0], l, no_threading ); + restrict_PRECISION( &vc[1], &vp[0], l, no_threading ); if ( !l->next_level->idle ) { - vector_PRECISION_minus( vc3, vc1, vc2, 0, civs, l->next_level ); - 
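/* --- Editorial sketch (not part of the patch) ------------------------------
   The "( P* P - 1 ) phi_c" check in this test routine relies on the
   prolongator P having orthonormal columns (built by the block Gram-Schmidt
   whose correctness is tested just above), so restriction after interpolation
   must return the coarse vector unchanged.  The same check with a small dense
   P, real arithmetic and standard C only (illustrative, not library API): */
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#define NF 8  /* fine dof   */
#define NC 3  /* coarse dof */
int main( void ) {
  double P[NF][NC], v[NC], Pv[NF], PtPv[NC], num = 0.0, den = 0.0;
  int i, j, k;
  srand( 1234 );
  /* random columns, then Gram-Schmidt so the columns of P are orthonormal */
  for ( j=0; j<NC; j++ ) {
    for ( i=0; i<NF; i++ ) P[i][j] = (double)rand()/RAND_MAX - 0.5;
    for ( k=0; k<j; k++ ) {
      double dot = 0.0;
      for ( i=0; i<NF; i++ ) dot += P[i][k]*P[i][j];
      for ( i=0; i<NF; i++ ) P[i][j] -= dot*P[i][k];
    }
    double nrm = 0.0;
    for ( i=0; i<NF; i++ ) nrm += P[i][j]*P[i][j];
    nrm = sqrt( nrm );
    for ( i=0; i<NF; i++ ) P[i][j] /= nrm;
  }
  /* random coarse vector; interpolate (P), then restrict (P^T) */
  for ( j=0; j<NC; j++ ) v[j] = (double)rand()/RAND_MAX - 0.5;
  for ( i=0; i<NF; i++ ) { Pv[i] = 0.0; for ( j=0; j<NC; j++ ) Pv[i] += P[i][j]*v[j]; }
  for ( j=0; j<NC; j++ ) { PtPv[j] = 0.0; for ( i=0; i<NF; i++ ) PtPv[j] += P[i][j]*Pv[i]; }
  /* relative difference, the quantity the surrounding test0 lines print */
  for ( j=0; j<NC; j++ ) { num += (PtPv[j]-v[j])*(PtPv[j]-v[j]); den += v[j]*v[j]; }
  printf( "correctness of ( P* P - 1 ) phi_c: %le\n", sqrt(num/den) );
  return 0;
}
/* -------------------------------------------------------------------------- */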
diff = global_norm_PRECISION( vc3, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( vc1, 0, civs, l->next_level, no_threading ); + vector_PRECISION_minus( &vc[2], &vc[0], &vc[1], 0, civs, l->next_level ); + diff = global_norm_PRECISION( &vc[2], 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( &vc[0], 0, civs, l->next_level, no_threading ); test0_PRECISION("depth: %d, correctness of ( P* P - 1 ) phi_c: %le\n", l->depth, abs_PRECISION(diff) ); } END_LOCKED_MASTER(threading) if(threading->n_core>1) { - interpolate3_PRECISION( vp1, vc1, l, threading ); - restrict_PRECISION( vc2, vp1, l, threading ); + interpolate3_PRECISION( &vp[0], &vc[0], l, threading ); + restrict_PRECISION( &vc[1], &vp[0], l, threading ); START_LOCKED_MASTER(threading) if ( !l->next_level->idle ) { - vector_PRECISION_minus( vc3, vc1, vc2, 0, civs, l->next_level ); - diff = global_norm_PRECISION( vc3, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( vc1, 0, civs, l->next_level, no_threading ); + vector_PRECISION_minus( &vc[2], &vc[0], &vc[1], 0, civs, l->next_level ); + diff = global_norm_PRECISION( &vc[2], 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( &vc[0], 0, civs, l->next_level, no_threading ); test0_PRECISION("depth: %d, correctness of ( P* P - 1 ) phi_c with threading: %le\n", l->depth, diff ); } END_LOCKED_MASTER(threading) @@ -785,27 +625,27 @@ void coarse_operator_PRECISION_test_routine( level_struct *l, struct Thread *thr START_LOCKED_MASTER(threading) if (l->depth==0) - gamma5_PRECISION( vp2, vp1, l, no_threading ); + gamma5_PRECISION( &vp[1], &vp[0], l, no_threading ); else - coarse_gamma5_PRECISION( vp2, vp1, 0, ivs, l ); - restrict_PRECISION( vc2, vp2, l, no_threading ); - coarse_gamma5_PRECISION( vc3, vc2, 0, civs, l->next_level ); + coarse_gamma5_PRECISION( &vp[1], &vp[0], 0, ivs, l ); + restrict_PRECISION( &vc[1], &vp[1], l, no_threading ); + coarse_gamma5_PRECISION( &vc[2], &vc[1], 0, civs, l->next_level ); if ( !l->next_level->idle ) { - vector_PRECISION_minus( vc2, vc1, vc3, 0, civs, l->next_level ); - diff = global_norm_PRECISION( vc2, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( vc1, 0, civs, l->next_level, no_threading ); + vector_PRECISION_minus( &vc[1], &vc[0], &vc[2], 0, civs, l->next_level ); + diff = global_norm_PRECISION( &vc[1], 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( &vc[0], 0, civs, l->next_level, no_threading ); test0_PRECISION("depth: %d, correctness of ( g5_c P* g5 P - 1 ) phi_c: %le\n", l->depth, diff ); } #ifdef HAVE_TM1p1 if(g.n_flavours == 2) { if (l->depth==0) - tau1_gamma5_PRECISION( vp2, vp1, l, no_threading ); + tau1_gamma5_PRECISION( &vp[1], &vp[0], l, no_threading ); else - coarse_tau1_gamma5_PRECISION( vp2, vp1, 0, ivs, l ); - restrict_PRECISION( vc2, vp2, l, no_threading ); - coarse_tau1_gamma5_PRECISION( vc3, vc2, 0, civs, l->next_level ); + coarse_tau1_gamma5_PRECISION( &vp[1], &vp[0], 0, ivs, l ); + restrict_PRECISION( &vc[1], &vp[1], l, no_threading ); + coarse_tau1_gamma5_PRECISION( &vc[2], &vc[1], 0, civs, l->next_level ); if ( !l->next_level->idle ) { - vector_PRECISION_minus( vc2, vc1, vc3, 0, civs, l->next_level ); - diff = global_norm_PRECISION( vc2, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( vc1, 0, civs, l->next_level, no_threading ); + vector_PRECISION_minus( &vc[1], &vc[0], &vc[2], 0, civs, l->next_level ); + diff = global_norm_PRECISION( &vc[1], 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( &vc[0], 0, civs, 
l->next_level, no_threading ); test0_PRECISION("depth: %d, correctness of ( tau1 g5_c P* tau1 g5 P - 1 ) phi_c: %le\n", l->depth, diff ); } } @@ -813,32 +653,32 @@ void coarse_operator_PRECISION_test_routine( level_struct *l, struct Thread *thr END_LOCKED_MASTER(threading) START_LOCKED_MASTER(threading) - vector_PRECISION_define( vp2, 0, 0, ivs, l ); + vector_PRECISION_define( &vp[1], 0, 0, ivs, l ); if (l->depth==0) - add_diagonal_PRECISION( vp2, vp1, l->s_PRECISION.op.odd_proj, ivs ); + add_diagonal_PRECISION( &vp[1], &vp[0], l->s_PRECISION.op.odd_proj, ivs ); else - coarse_add_block_diagonal_PRECISION( vp2, vp1, l->s_PRECISION.op.odd_proj, ivs, l ); - restrict_PRECISION( vc2, vp2, l, no_threading ); + coarse_add_block_diagonal_PRECISION( &vp[1], &vp[0], l->s_PRECISION.op.odd_proj, ivs, l ); + restrict_PRECISION( &vc[1], &vp[1], l, no_threading ); - vector_PRECISION_scale( vc2, vc2, -1.0, 0, civs, l->next_level ); - coarse_add_block_diagonal_PRECISION( vc2, vc1, l->next_level->s_PRECISION.op.odd_proj, civs, l->next_level ); - diff = global_norm_PRECISION( vc2, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( vc1, 0, civs, l->next_level, no_threading ); + vector_PRECISION_scale( &vc[1], &vc[1], -1.0, 0, civs, l->next_level ); + coarse_add_block_diagonal_PRECISION( &vc[1], &vc[0], l->next_level->s_PRECISION.op.odd_proj, civs, l->next_level ); + diff = global_norm_PRECISION( &vc[1], 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( &vc[0], 0, civs, l->next_level, no_threading ); test0_PRECISION("depth: %d, correctness of ( P* 1odd P - 1odd_c ) phi_c: %le\n", l->depth, diff ); END_LOCKED_MASTER(threading) #ifdef HAVE_TM START_LOCKED_MASTER(threading) if (g.mu + g.mu_odd_shift != 0.0 || g.mu + g.mu_even_shift != 0.0 ) { - vector_PRECISION_define( vp2, 0, 0, ivs, l ); + vector_PRECISION_define( &vp[1], 0, 0, ivs, l ); if (l->depth==0) - add_diagonal_PRECISION( vp2, vp1, l->s_PRECISION.op.tm_term, ivs ); + add_diagonal_PRECISION( &vp[1], &vp[0], l->s_PRECISION.op.tm_term, ivs ); else - coarse_add_anti_block_diagonal_PRECISION( vp2, vp1, l->s_PRECISION.op.tm_term, ivs, l ); - restrict_PRECISION( vc2, vp2, l, no_threading ); + coarse_add_anti_block_diagonal_PRECISION( &vp[1], &vp[0], l->s_PRECISION.op.tm_term, ivs, l ); + restrict_PRECISION( &vc[1], &vp[1], l, no_threading ); - vector_PRECISION_scale( vc2, vc2, -g.mu_factor[l->next_level->depth]/g.mu_factor[l->depth], 0, civs, l->next_level ); - coarse_add_anti_block_diagonal_PRECISION( vc2, vc1, l->next_level->s_PRECISION.op.tm_term, civs, l->next_level ); - diff = global_norm_PRECISION( vc2, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( vc1, 0, civs, l->next_level, no_threading ); + vector_PRECISION_scale( &vc[1], &vc[1], -g.mu_factor[l->next_level->depth]/g.mu_factor[l->depth], 0, civs, l->next_level ); + coarse_add_anti_block_diagonal_PRECISION( &vc[1], &vc[0], l->next_level->s_PRECISION.op.tm_term, civs, l->next_level ); + diff = global_norm_PRECISION( &vc[1], 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( &vc[0], 0, civs, l->next_level, no_threading ); test0_PRECISION("depth: %d, correctness of ( P* tm P - tm_c ) phi_c: %le\n", l->depth, diff ); } END_LOCKED_MASTER(threading) @@ -848,16 +688,16 @@ void coarse_operator_PRECISION_test_routine( level_struct *l, struct Thread *thr START_LOCKED_MASTER(threading) if ( g.n_flavours == 2 && ( g.epsbar != 0 || g.epsbar_ig5_odd_shift != 0 || g.epsbar_ig5_odd_shift != 0 ) ) { - vector_PRECISION_define( vp2, 0, 0, ivs, l ); + 
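/* --- Editorial sketch (not part of the patch) ------------------------------
   Most of the consistency checks in this routine reduce to the same quantity:
   take the difference of two vectors and print its norm relative to a
   reference norm (vector_PRECISION_minus followed by two
   global_norm_PRECISION calls).  A minimal helper capturing that idiom for
   plain double arrays (illustrative only, not library API): */
#include <math.h>
static double rel_diff( const double *a, const double *b, int n ) {
  double num = 0.0, den = 0.0;
  for ( int i=0; i<n; i++ ) {
    num += (a[i]-b[i])*(a[i]-b[i]);   /* || a - b ||^2 */
    den += b[i]*b[i];                 /* || b ||^2     */
  }
  return sqrt( num/den );
}
/* usage sketch: diff = rel_diff( restricted_fine_result, coarse_result, civs ); */
/* -------------------------------------------------------------------------- */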
vector_PRECISION_define( &vp[1], 0, 0, ivs, l ); if (l->depth==0) - apply_doublet_coupling_PRECISION( vp2, vp1, l->s_PRECISION.op.epsbar_term, ivs ); + apply_doublet_coupling_PRECISION( &vp[1], &vp[0], l->s_PRECISION.op.epsbar_term, ivs ); else - coarse_add_doublet_coupling_PRECISION( vp2, vp1, l->s_PRECISION.op.epsbar_term, ivs, l ); - restrict_PRECISION( vc2, vp2, l, no_threading ); + coarse_add_doublet_coupling_PRECISION( &vp[1], &vp[0], l->s_PRECISION.op.epsbar_term, ivs, l ); + restrict_PRECISION( &vc[1], &vp[1], l, no_threading ); - vector_PRECISION_scale( vc2, vc2, -g.epsbar_factor[l->next_level->depth]/g.epsbar_factor[l->depth], 0, civs, l->next_level ); - coarse_add_doublet_coupling_PRECISION( vc2, vc1, l->next_level->s_PRECISION.op.epsbar_term, civs, l->next_level ); - diff = global_norm_PRECISION( vc2, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( vc1, 0, civs, l->next_level, no_threading ); + vector_PRECISION_scale( &vc[1], &vc[1], -g.epsbar_factor[l->next_level->depth]/g.epsbar_factor[l->depth], 0, civs, l->next_level ); + coarse_add_doublet_coupling_PRECISION( &vc[1], &vc[0], l->next_level->s_PRECISION.op.epsbar_term, civs, l->next_level ); + diff = global_norm_PRECISION( &vc[1], 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( &vc[0], 0, civs, l->next_level, no_threading ); test0_PRECISION("depth: %d, correctness of ( P* eps P - eps_c ) phi_c: %le\n", l->depth, diff ); } END_LOCKED_MASTER(threading) @@ -865,30 +705,30 @@ void coarse_operator_PRECISION_test_routine( level_struct *l, struct Thread *thr if ( l->level > 0 ) { START_LOCKED_MASTER(threading) - interpolate3_PRECISION( vp1, vc1, l, no_threading ); + interpolate3_PRECISION( &vp[0], &vc[0], l, no_threading ); - apply_operator_PRECISION( vp2, vp1, &(l->p_PRECISION), l, no_threading ); + apply_operator_PRECISION( &vp[1], &vp[0], &(l->p_PRECISION), l, no_threading ); #ifdef HAVE_TM if (g.mu + g.mu_odd_shift != 0.0 || g.mu + g.mu_even_shift != 0.0 ) if (g.mu_factor[l->depth] != g.mu_factor[l->next_level->depth]) { - vector_PRECISION_scale( vp3, vp1, (g.mu_factor[l->next_level->depth]/g.mu_factor[l->depth])-1., 0, ivs, l ); + vector_PRECISION_scale( &vp[2], &vp[0], (g.mu_factor[l->next_level->depth]/g.mu_factor[l->depth])-1., 0, ivs, l ); if(l->depth == 0) - add_diagonal_PRECISION( vp2, vp3, l->p_PRECISION.op->tm_term, ivs ); + add_diagonal_PRECISION( &vp[1], &vp[2], l->p_PRECISION.op->tm_term, ivs ); else - coarse_add_anti_block_diagonal_PRECISION( vp2, vp3, l->p_PRECISION.op->tm_term, ivs, l ); + coarse_add_anti_block_diagonal_PRECISION( &vp[1], &vp[2], l->p_PRECISION.op->tm_term, ivs, l ); } #endif - restrict_PRECISION( vc2, vp2, l, no_threading ); + restrict_PRECISION( &vc[1], &vp[1], l, no_threading ); if ( !l->next_level->idle ) { if ( l->level==1 && g.odd_even ) - coarse_odd_even_PRECISION_test( vc3, vc1, l->next_level, no_threading ); + coarse_odd_even_PRECISION_test( &vc[2], &vc[0], l->next_level, no_threading ); else - apply_operator_PRECISION( vc3, vc1, &(l->next_level->p_PRECISION), l->next_level, no_threading ); + apply_operator_PRECISION( &vc[2], &vc[0], &(l->next_level->p_PRECISION), l->next_level, no_threading ); - vector_PRECISION_minus( vc3, vc2, vc3, 0, civs, l->next_level ); - diff = global_norm_PRECISION( vc3, 0, civs, l->next_level, no_threading ) /global_norm_PRECISION( vc2, 0, civs, l->next_level, no_threading ); + vector_PRECISION_minus( &vc[2], &vc[1], &vc[2], 0, civs, l->next_level ); + diff = global_norm_PRECISION( &vc[2], 0, civs, l->next_level, no_threading ) 
/global_norm_PRECISION( &vc[1], 0, civs, l->next_level, no_threading ); if ( l->level==1 && g.odd_even ) { test0_PRECISION("depth: %d, correctness of odd even preconditioned ( P* D P - D_c ) phi_c: %le\n", l->depth, diff ); @@ -901,14 +741,14 @@ void coarse_operator_PRECISION_test_routine( level_struct *l, struct Thread *thr if(threading->n_core>1) { if ( !l->next_level->idle ) { if ( l->level==1 && g.odd_even ) - coarse_odd_even_PRECISION_test( vc3, vc1, l->next_level, threading ); + coarse_odd_even_PRECISION_test( &vc[2], &vc[0], l->next_level, threading ); else - apply_operator_PRECISION( vc3, vc1, &(l->next_level->p_PRECISION), l->next_level, threading ); + apply_operator_PRECISION( &vc[2], &vc[0], &(l->next_level->p_PRECISION), l->next_level, threading ); } START_LOCKED_MASTER(threading) if ( !l->next_level->idle ) { - vector_PRECISION_minus( vc3, vc2, vc3, 0, civs, l->next_level ); - diff = global_norm_PRECISION( vc3, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( vc2, 0, civs, l->next_level, no_threading ); + vector_PRECISION_minus( &vc[2], &vc[1], &vc[2], 0, civs, l->next_level ); + diff = global_norm_PRECISION( &vc[2], 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( &vc[1], 0, civs, l->next_level, no_threading ); if ( l->level==1 && g.odd_even ) { //TODO: this test doesn't work without SSE!! test0_PRECISION("depth: %d, correctness of odd even preconditioned ( P* D P - D_c ) phi_c with D_c threaded: %le\n", l->depth, diff ); } else { @@ -921,29 +761,33 @@ void coarse_operator_PRECISION_test_routine( level_struct *l, struct Thread *thr START_LOCKED_MASTER(threading) if ( l->level > 0 && l->depth > 0 && g.method == 3 && g.odd_even ) { - vector_PRECISION_define_random( vp1, 0, ivs, l ); - block_to_oddeven_PRECISION( vp4, vp1, l, no_threading ); - coarse_diag_ee_PRECISION( vp3, vp4, &(l->oe_op_PRECISION), l, no_threading ); - coarse_diag_oo_PRECISION( vp3, vp4, &(l->oe_op_PRECISION), l, no_threading ); - coarse_hopping_term_PRECISION( vp3, vp4, &(l->oe_op_PRECISION), _FULL_SYSTEM, l, no_threading ); - oddeven_to_block_PRECISION( vp4, vp3, l, no_threading ); - apply_operator_PRECISION( vp2, vp1, &(l->p_PRECISION), l, no_threading ); - vector_PRECISION_minus( vp4, vp4, vp2, 0, ivs, l ); - diff = global_norm_PRECISION( vp4, 0, ivs, l, no_threading ) / global_norm_PRECISION( vp2, 0, ivs, l, no_threading ); + vector_PRECISION_define_random( &vp[0], 0, ivs, l ); + block_to_oddeven_PRECISION( &vp[3], &vp[0], l, no_threading ); + coarse_diag_ee_PRECISION( &vp[2], &vp[3], &(l->oe_op_PRECISION), l, no_threading ); + coarse_diag_oo_PRECISION( &vp[2], &vp[3], &(l->oe_op_PRECISION), l, no_threading ); + coarse_hopping_term_PRECISION( &vp[2], &vp[3], &(l->oe_op_PRECISION), _FULL_SYSTEM, l, no_threading ); + oddeven_to_block_PRECISION( &vp[3], &vp[2], l, no_threading ); + apply_operator_PRECISION( &vp[1], &vp[0], &(l->p_PRECISION), l, no_threading ); + vector_PRECISION_minus( &vp[3], &vp[3], &vp[1], 0, ivs, l ); + diff = global_norm_PRECISION( &vp[3], 0, ivs, l, no_threading ) / global_norm_PRECISION( &vp[1], 0, ivs, l, no_threading ); test0_PRECISION("depth: %d, correctness of odd even layout (smoother): %le\n", l->depth, diff ); - block_to_oddeven_PRECISION( vp4, vp1, l, no_threading ); - coarse_odd_even_PRECISION_test( vp3, vp4, l, no_threading ); - oddeven_to_block_PRECISION( vp4, vp3, l, no_threading ); - apply_operator_PRECISION( vp2, vp1, &(l->p_PRECISION), l, no_threading ); - vector_PRECISION_minus( vp4, vp4, vp2, 0, ivs, l ); - diff = global_norm_PRECISION( 
vp4, 0, ivs, l, no_threading ) / global_norm_PRECISION( vp2, 0, ivs, l, no_threading ); + block_to_oddeven_PRECISION( &vp[3], &vp[0], l, no_threading ); + coarse_odd_even_PRECISION_test( &vp[2], &vp[3], l, no_threading ); + oddeven_to_block_PRECISION( &vp[3], &vp[2], l, no_threading ); + apply_operator_PRECISION( &vp[1], &vp[0], &(l->p_PRECISION), l, no_threading ); + vector_PRECISION_minus( &vp[3], &vp[3], &vp[1], 0, ivs, l ); + diff = global_norm_PRECISION( &vp[3], 0, ivs, l, no_threading ) / global_norm_PRECISION( &vp[1], 0, ivs, l, no_threading ); test0_PRECISION("depth: %d, correctness of odd even preconditioned operator (smoother): %le\n", l->depth, diff ); } + + for(int i=0; i<4; i++) + vector_PRECISION_free( &vp[i], l, threading ); - FREE( vp1, complex_PRECISION, 4*vs ); - FREE( vc1, complex_PRECISION, 3*cvs ); - END_LOCKED_MASTER(threading) + for(int i=0; i<3; i++) + vector_PRECISION_free( &vc[i], l->next_level, threading ); + + END_LOCKED_MASTER(threading) if ( g.method != 6 && l->next_level->level > 0 && !l->next_level->idle ) { schwarz_PRECISION_mvm_testfun( &(l->next_level->s_PRECISION), l->next_level, threading ); diff --git a/src/coarse_operator_generic.h b/src/coarse_operator_generic.h index 3af0655..a33c594 100644 --- a/src/coarse_operator_generic.h +++ b/src/coarse_operator_generic.h @@ -22,53 +22,50 @@ #ifndef COARSE_OPERATOR_PRECISION_HEADER #define COARSE_OPERATOR_PRECISION_HEADER - #include "blas_vectorized.h" - struct Thread; void coarse_operator_PRECISION_alloc( level_struct *l ); void coarse_operator_PRECISION_free( level_struct *l ); - void coarse_operator_PRECISION_free_vectorized( operator_PRECISION_struct *op, level_struct *l ); void coarse_operator_PRECISION_setup( vector_PRECISION *V, level_struct *l ); void coarse_operator_PRECISION_setup_finalize( level_struct *l, struct Thread *threading ); void coarse_operator_PRECISION_set_couplings( operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); void coarse_operator_PRECISION_set_self_couplings( operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); void coarse_operator_PRECISION_set_neighbor_couplings( operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); - void set_coarse_self_coupling_PRECISION( vector_PRECISION buffer1, vector_PRECISION buffer2, + void set_coarse_self_coupling_PRECISION( vector_PRECISION *buffer1, vector_PRECISION *buffer2, vector_PRECISION *V, const int n, level_struct *l ); - void set_coarse_neighbor_coupling_PRECISION( vector_PRECISION buffer1, vector_PRECISION buffer2, + void set_coarse_neighbor_coupling_PRECISION( vector_PRECISION *buffer1, vector_PRECISION *buffer2, vector_PRECISION *V, const int mu, const int n, level_struct *l ); - void coarse_self_couplings_PRECISION( vector_PRECISION eta, vector_PRECISION phi, + void coarse_self_couplings_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, operator_PRECISION_struct *op, int start, int end, level_struct *l ); - void coarse_spinwise_self_couplings_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, vector_PRECISION phi, + void coarse_spinwise_self_couplings_PRECISION( vector_PRECISION *eta1, vector_PRECISION *eta2, vector_PRECISION *phi, config_PRECISION clover, int length, level_struct *l ); - void coarse_gamma5_PRECISION( vector_PRECISION eta, vector_PRECISION phi, int start, int end, level_struct *l ); - void coarse_tau1_gamma5_PRECISION( vector_PRECISION eta, vector_PRECISION phi, int start, int end, level_struct *l ); - void 
apply_coarse_operator_PRECISION( vector_PRECISION eta, vector_PRECISION phi, + void coarse_gamma5_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, int start, int end, level_struct *l ); + void coarse_tau1_gamma5_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, int start, int end, level_struct *l ); + void apply_coarse_operator_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); - void g5D_apply_coarse_operator_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operator_PRECISION_struct *op, + void g5D_apply_coarse_operator_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); - void apply_coarse_operator_dagger_PRECISION( vector_PRECISION eta, vector_PRECISION phi, + void apply_coarse_operator_dagger_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); - void coarse_block_operator_PRECISION( vector_PRECISION eta, vector_PRECISION phi, int start, + void coarse_block_operator_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, int start, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ); - void coarse_aggregate_self_couplings_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, vector_PRECISION phi, + void coarse_aggregate_self_couplings_PRECISION( vector_PRECISION *eta1, vector_PRECISION *eta2, vector_PRECISION *phi, schwarz_PRECISION_struct *s, level_struct *l ); - void coarse_aggregate_neighbor_couplings_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, vector_PRECISION phi, const int mu, schwarz_PRECISION_struct *s, level_struct *l ); + void coarse_aggregate_neighbor_couplings_PRECISION( vector_PRECISION *eta1, vector_PRECISION *eta2, vector_PRECISION *phi, const int mu, schwarz_PRECISION_struct *s, level_struct *l ); - void set_block_diagonal_PRECISION( vector_PRECISION spin_0_1, vector_PRECISION spin_2_3, vector_PRECISION *V, const int n, config_PRECISION block, level_struct *l ); + void set_block_diagonal_PRECISION( vector_PRECISION *spin_0_1, vector_PRECISION *spin_2_3, vector_PRECISION *V, const int n, config_PRECISION block, level_struct *l ); - void coarse_aggregate_block_diagonal_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, vector_PRECISION phi, config_PRECISION block, level_struct *l ); + void coarse_aggregate_block_diagonal_PRECISION( vector_PRECISION *eta1, vector_PRECISION *eta2, vector_PRECISION *phi, config_PRECISION block, level_struct *l ); void coarse_operator_PRECISION_test_routine( level_struct *l, struct Thread *threading ); // eta += D*phi, D stored columnwise - static inline void mv_PRECISION( const vector_PRECISION eta, const complex_PRECISION *D, - const vector_PRECISION phi, const register int n ) { + static inline void mv_PRECISION( const buffer_PRECISION eta, const complex_PRECISION *D, + const buffer_PRECISION phi, const register int n ) { register int i, j, k=0; for ( i=0; inum_lattice_site_var, @@ -199,7 +196,7 @@ clover_step_size1 = (num_eig_vect * (num_eig_vect+1))/2, clover_step_size2 = SQUARE(num_eig_vect); config_PRECISION clover_pt = clover; - vector_PRECISION phi_pt=phi, eta_pt=eta, phi_end_pt=phi+length; + buffer_PRECISION phi_pt=phi->vector_buffer, eta_pt=eta->vector_buffer, phi_end_pt=phi->vector_buffer+length; // U(x) = [ A B , A=A*, D=D*, C = -B* // C D ] // storage order: upper triangle of A, upper triangle of D, B, columnwise @@ -257,13 
+254,13 @@ } } - static inline void coarse_add_block_diagonal_PRECISION( vector_PRECISION eta, vector_PRECISION phi, + static inline void coarse_add_block_diagonal_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, config_PRECISION block, int length, level_struct *l ) { int num_eig_vect = l->num_parent_eig_vect, block_step_size = (num_eig_vect * (num_eig_vect+1))/2; config_PRECISION block_pt = block; - vector_PRECISION phi_pt=phi, eta_pt=eta, phi_end_pt=phi+length; + buffer_PRECISION phi_pt=phi->vector_buffer, eta_pt=eta->vector_buffer, phi_end_pt=phi->vector_buffer+length; // U(x) = [ A 0 , A=A*, D=D* diag. excluded // 0 D ] // storage order: upper triangle of A, upper triangle of D, columnwise @@ -294,13 +291,13 @@ } } - static inline void coarse_add_anti_block_diagonal_PRECISION( vector_PRECISION eta, vector_PRECISION phi, + static inline void coarse_add_anti_block_diagonal_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, config_PRECISION block, int length, level_struct *l ) { int num_eig_vect = l->num_parent_eig_vect, block_step_size = (num_eig_vect * (num_eig_vect+1))/2; config_PRECISION block_pt = block; - vector_PRECISION phi_pt=phi, eta_pt=eta, phi_end_pt=phi+length; + buffer_PRECISION phi_pt=phi->vector_buffer, eta_pt=eta->vector_buffer, phi_end_pt=phi->vector_buffer+length; // U(x) = [ A 0 , A=-A*, D=-D* diag. excluded // 0 D ] // storage order: upper triangle of A, upper triangle of D, columnwise @@ -331,14 +328,14 @@ } } - static inline void coarse_add_doublet_coupling_PRECISION( vector_PRECISION eta, vector_PRECISION phi, + static inline void coarse_add_doublet_coupling_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, config_PRECISION block, int length, level_struct *l ) { #ifdef HAVE_TM1p1 int num_eig_vect = l->num_parent_eig_vect, block_step_size = (num_eig_vect * (num_eig_vect+1))/2; config_PRECISION block_pt = block; - vector_PRECISION phi_pt=phi, eta_pt=eta, phi_end_pt=phi+length; + buffer_PRECISION phi_pt=phi->vector_buffer, eta_pt=eta->vector_buffer, phi_end_pt=phi->vector_buffer+length; // U(x) = [ 0 A , A=-A*, D=-D* diag. 
excluded // D 0 ] // storage order: upper triangle of A, upper triangle of D, columnwise @@ -360,11 +357,12 @@ #endif } - static inline void coarse_hopp_PRECISION( vector_PRECISION eta, vector_PRECISION phi, + static inline void coarse_hopp_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, config_PRECISION D, level_struct *l ) { int num_eig_vect = l->num_parent_eig_vect, num_eig_vect2 = SQUARE(l->num_parent_eig_vect); + buffer_PRECISION phi_pt=phi->vector_buffer, eta_pt=eta->vector_buffer; // U_mu(x) = [ A B , U_-mu(x+muhat) = [ A* -C* // C D ] -B* D* ] // storage order: A, C, B, D @@ -373,62 +371,63 @@ #ifdef HAVE_TM1p1 if( g.n_flavours == 2 ) { // A - nmv_PRECISION( eta, D, phi, num_eig_vect ); - eta += num_eig_vect;//1 - phi += num_eig_vect;//1 - nmv_PRECISION( eta, D, phi, num_eig_vect ); + nmv_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); + eta_pt += num_eig_vect;//1 + phi_pt += num_eig_vect;//1 + nmv_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); // C - eta += num_eig_vect;//2 - phi -= num_eig_vect;//0 + eta_pt += num_eig_vect;//2 + phi_pt -= num_eig_vect;//0 D += num_eig_vect2; - nmv_PRECISION( eta, D, phi, num_eig_vect ); - eta += num_eig_vect;//3 - phi += num_eig_vect;//1 - nmv_PRECISION( eta, D, phi, num_eig_vect ); + nmv_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); + eta_pt += num_eig_vect;//3 + phi_pt += num_eig_vect;//1 + nmv_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); // B - eta -= 3*num_eig_vect;//0 - phi += num_eig_vect;//2 + eta_pt -= 3*num_eig_vect;//0 + phi_pt += num_eig_vect;//2 D += num_eig_vect2; - nmv_PRECISION( eta, D, phi, num_eig_vect ); - eta += num_eig_vect;//1 - phi += num_eig_vect;//3 - nmv_PRECISION( eta, D, phi, num_eig_vect ); + nmv_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); + eta_pt += num_eig_vect;//1 + phi_pt += num_eig_vect;//3 + nmv_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); // D - eta += num_eig_vect;//2 - phi -= num_eig_vect;//2 + eta_pt += num_eig_vect;//2 + phi_pt -= num_eig_vect;//2 D += num_eig_vect2; - nmv_PRECISION( eta, D, phi, num_eig_vect ); - eta += num_eig_vect;//3 - phi += num_eig_vect;//3 - nmv_PRECISION( eta, D, phi, num_eig_vect ); + nmv_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); + eta_pt += num_eig_vect;//3 + phi_pt += num_eig_vect;//3 + nmv_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); } else { #endif // A - nmv_PRECISION( eta, D, phi, num_eig_vect ); + nmv_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); // C - eta += num_eig_vect; + eta_pt += num_eig_vect; D += num_eig_vect2; - nmv_PRECISION( eta, D, phi, num_eig_vect ); + nmv_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); // B - phi += num_eig_vect; - eta -= num_eig_vect; + phi_pt += num_eig_vect; + eta_pt -= num_eig_vect; D += num_eig_vect2; - nmv_PRECISION( eta, D, phi, num_eig_vect ); + nmv_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); // D - eta += num_eig_vect; + eta_pt += num_eig_vect; D += num_eig_vect2; - nmv_PRECISION( eta, D, phi, num_eig_vect ); + nmv_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); #ifdef HAVE_TM1p1 } #endif } - static inline void coarse_daggered_hopp_PRECISION( vector_PRECISION eta, vector_PRECISION phi, + static inline void coarse_daggered_hopp_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, config_PRECISION D, level_struct *l ) { int num_eig_vect = l->num_parent_eig_vect, num_eig_vect2 = SQUARE(l->num_parent_eig_vect); + buffer_PRECISION phi_pt=phi->vector_buffer, eta_pt=eta->vector_buffer; // U_mu(x) = [ A B , U_-mu(x+muhat) = [ A* -C* // C D ] -B* D* ] // storage order: A, C, B, D @@ -437,61 +436,62 @@ #ifdef HAVE_TM1p1 if( 
g.n_flavours == 2 ) { // A* - nmvh_PRECISION( eta, D, phi, num_eig_vect ); - eta += num_eig_vect;//1 - phi += num_eig_vect;//1 - nmvh_PRECISION( eta, D, phi, num_eig_vect ); + nmvh_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); + eta_pt += num_eig_vect;//1 + phi_pt += num_eig_vect;//1 + nmvh_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); // -C* - eta -= num_eig_vect;//0 - phi += num_eig_vect;//2 + eta_pt -= num_eig_vect;//0 + phi_pt += num_eig_vect;//2 D += num_eig_vect2; - mvh_PRECISION( eta, D, phi, num_eig_vect ); - eta += num_eig_vect;//1 - phi += num_eig_vect;//3 - mvh_PRECISION( eta, D, phi, num_eig_vect ); + mvh_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); + eta_pt += num_eig_vect;//1 + phi_pt += num_eig_vect;//3 + mvh_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); // -B* - eta += num_eig_vect;//2 - phi -= 3*num_eig_vect;//0 + eta_pt += num_eig_vect;//2 + phi_pt -= 3*num_eig_vect;//0 D += num_eig_vect2; - mvh_PRECISION( eta, D, phi, num_eig_vect ); - eta += num_eig_vect;//3 - phi += num_eig_vect;//1 - mvh_PRECISION( eta, D, phi, num_eig_vect ); + mvh_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); + eta_pt += num_eig_vect;//3 + phi_pt += num_eig_vect;//1 + mvh_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); // D* - eta -= num_eig_vect;//2 - phi += num_eig_vect;//2 + eta_pt -= num_eig_vect;//2 + phi_pt += num_eig_vect;//2 D += num_eig_vect2; - nmvh_PRECISION( eta, D, phi, num_eig_vect ); - eta += num_eig_vect;//3 - phi += num_eig_vect;//3 - nmvh_PRECISION( eta, D, phi, num_eig_vect ); + nmvh_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); + eta_pt += num_eig_vect;//3 + phi_pt += num_eig_vect;//3 + nmvh_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); } else { #endif // A* - nmvh_PRECISION( eta, D, phi, num_eig_vect ); + nmvh_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); // -C* - phi += num_eig_vect; + phi_pt += num_eig_vect; D += num_eig_vect2; - mvh_PRECISION( eta, D, phi, num_eig_vect ); + mvh_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); // -B* - eta += num_eig_vect; - phi -= num_eig_vect; + eta_pt += num_eig_vect; + phi_pt -= num_eig_vect; D += num_eig_vect2; - mvh_PRECISION( eta, D, phi, num_eig_vect ); + mvh_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); // D* - phi += num_eig_vect; + phi_pt += num_eig_vect; D += num_eig_vect2; - nmvh_PRECISION( eta, D, phi, num_eig_vect ); + nmvh_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); #ifdef HAVE_TM1p1 } #endif } - static inline void coarse_n_hopp_PRECISION( vector_PRECISION eta, vector_PRECISION phi, + static inline void coarse_n_hopp_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, config_PRECISION D, level_struct *l ) { int num_eig_vect = l->num_parent_eig_vect, num_eig_vect2 = SQUARE(l->num_parent_eig_vect); + buffer_PRECISION phi_pt=phi->vector_buffer, eta_pt=eta->vector_buffer; // U_mu(x) = [ A B , U_-mu(x+muhat) = [ A* -C* // C D ] -B* D* ] // storage order: A, C, B, D @@ -500,61 +500,62 @@ #ifdef HAVE_TM1p1 if( g.n_flavours == 2 ) { // A - mv_PRECISION( eta, D, phi, num_eig_vect ); - eta += num_eig_vect;//1 - phi += num_eig_vect;//1 - mv_PRECISION( eta, D, phi, num_eig_vect ); + mv_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); + eta_pt += num_eig_vect;//1 + phi_pt += num_eig_vect;//1 + mv_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); // C - eta += num_eig_vect;//2 - phi -= num_eig_vect;//0 + eta_pt += num_eig_vect;//2 + phi_pt -= num_eig_vect;//0 D += num_eig_vect2; - mv_PRECISION( eta, D, phi, num_eig_vect ); - eta += num_eig_vect;//3 - phi += num_eig_vect;//1 - mv_PRECISION( eta, D, phi, num_eig_vect ); + mv_PRECISION( eta_pt, D, 
phi_pt, num_eig_vect ); + eta_pt += num_eig_vect;//3 + phi_pt += num_eig_vect;//1 + mv_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); // B - eta -= 3*num_eig_vect;//0 - phi += num_eig_vect;//2 + eta_pt -= 3*num_eig_vect;//0 + phi_pt += num_eig_vect;//2 D += num_eig_vect2; - mv_PRECISION( eta, D, phi, num_eig_vect ); - eta += num_eig_vect;//1 - phi += num_eig_vect;//3 - mv_PRECISION( eta, D, phi, num_eig_vect ); + mv_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); + eta_pt += num_eig_vect;//1 + phi_pt += num_eig_vect;//3 + mv_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); // D - eta += num_eig_vect;//2 - phi -= num_eig_vect;//2 + eta_pt += num_eig_vect;//2 + phi_pt -= num_eig_vect;//2 D += num_eig_vect2; - mv_PRECISION( eta, D, phi, num_eig_vect ); - eta += num_eig_vect;//3 - phi += num_eig_vect;//3 - mv_PRECISION( eta, D, phi, num_eig_vect ); + mv_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); + eta_pt += num_eig_vect;//3 + phi_pt += num_eig_vect;//3 + mv_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); } else { #endif // A - mv_PRECISION( eta, D, phi, num_eig_vect ); + mv_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); // C - eta += num_eig_vect; + eta_pt += num_eig_vect; D += num_eig_vect2; - mv_PRECISION( eta, D, phi, num_eig_vect ); + mv_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); // B - phi += num_eig_vect; - eta -= num_eig_vect; + phi_pt += num_eig_vect; + eta_pt -= num_eig_vect; D += num_eig_vect2; - mv_PRECISION( eta, D, phi, num_eig_vect ); + mv_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); // D - eta += num_eig_vect; + eta_pt += num_eig_vect; D += num_eig_vect2; - mv_PRECISION( eta, D, phi, num_eig_vect ); + mv_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); #ifdef HAVE_TM1p1 } #endif } - static inline void coarse_n_daggered_hopp_PRECISION( vector_PRECISION eta, vector_PRECISION phi, + static inline void coarse_n_daggered_hopp_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, config_PRECISION D, level_struct *l ) { int num_eig_vect = l->num_parent_eig_vect, num_eig_vect2 = SQUARE(l->num_parent_eig_vect); + buffer_PRECISION phi_pt=phi->vector_buffer, eta_pt=eta->vector_buffer; // U_mu(x) = [ A B , U_-mu(x+muhat) = [ A* -C* // C D ] -B* D* ] // storage order: A, C, B, D @@ -563,164 +564,168 @@ #ifdef HAVE_TM1p1 if( g.n_flavours == 2 ) { // A* - mvh_PRECISION( eta, D, phi, num_eig_vect ); - eta += num_eig_vect;//1 - phi += num_eig_vect;//1 - mvh_PRECISION( eta, D, phi, num_eig_vect ); + mvh_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); + eta_pt += num_eig_vect;//1 + phi_pt += num_eig_vect;//1 + mvh_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); // -C* - eta -= num_eig_vect;//0 - phi += num_eig_vect;//2 + eta_pt -= num_eig_vect;//0 + phi_pt += num_eig_vect;//2 D += num_eig_vect2; - nmvh_PRECISION( eta, D, phi, num_eig_vect ); - eta += num_eig_vect;//1 - phi += num_eig_vect;//3 - nmvh_PRECISION( eta, D, phi, num_eig_vect ); + nmvh_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); + eta_pt += num_eig_vect;//1 + phi_pt += num_eig_vect;//3 + nmvh_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); // -B* - eta += num_eig_vect;//2 - phi -= 3*num_eig_vect;//0 + eta_pt += num_eig_vect;//2 + phi_pt -= 3*num_eig_vect;//0 D += num_eig_vect2; - nmvh_PRECISION( eta, D, phi, num_eig_vect ); - eta += num_eig_vect;//3 - phi += num_eig_vect;//1 - nmvh_PRECISION( eta, D, phi, num_eig_vect ); + nmvh_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); + eta_pt += num_eig_vect;//3 + phi_pt += num_eig_vect;//1 + nmvh_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); // D* - eta -= num_eig_vect;//2 - phi += num_eig_vect;//2 + 
eta_pt -= num_eig_vect;//2 + phi_pt += num_eig_vect;//2 D += num_eig_vect2; - mvh_PRECISION( eta, D, phi, num_eig_vect ); - eta += num_eig_vect;//3 - phi += num_eig_vect;//3 - mvh_PRECISION( eta, D, phi, num_eig_vect ); + mvh_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); + eta_pt += num_eig_vect;//3 + phi_pt += num_eig_vect;//3 + mvh_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); } else { #endif // A* - mvh_PRECISION( eta, D, phi, num_eig_vect ); + mvh_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); // -C* - phi += num_eig_vect; + phi_pt += num_eig_vect; D += num_eig_vect2; - nmvh_PRECISION( eta, D, phi, num_eig_vect ); + nmvh_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); // -B* - eta += num_eig_vect; - phi -= num_eig_vect; + eta_pt += num_eig_vect; + phi_pt -= num_eig_vect; D += num_eig_vect2; - nmvh_PRECISION( eta, D, phi, num_eig_vect ); + nmvh_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); // D* - phi += num_eig_vect; + phi_pt += num_eig_vect; D += num_eig_vect2; - mvh_PRECISION( eta, D, phi, num_eig_vect ); + mvh_PRECISION( eta_pt, D, phi_pt, num_eig_vect ); #ifdef HAVE_TM1p1 } #endif } - static inline void coarse_spinwise_hopp_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, - vector_PRECISION phi, config_PRECISION D, level_struct *l ) { + static inline void coarse_spinwise_hopp_PRECISION( vector_PRECISION *eta1, vector_PRECISION *eta2, + vector_PRECISION *phi, config_PRECISION D, level_struct *l ) { int num_eig_vect = l->num_parent_eig_vect, num_eig_vect2 = SQUARE(l->num_parent_eig_vect); + buffer_PRECISION phi_pt=phi->vector_buffer, eta1_pt=eta1->vector_buffer, eta2_pt=eta2->vector_buffer; // U_mu(x) = [ A B , U_-mu(x+muhat) = [ A* -C* // C D ] -B* D* ] // storage order: A, C, B, D // note: minus sign of D = self_coupling - hopping_term is added here // A - mv_PRECISION( eta1, D, phi, num_eig_vect ); + mv_PRECISION( eta1_pt, D, phi_pt, num_eig_vect ); // C - eta1 += num_eig_vect; + eta1_pt += num_eig_vect; D += num_eig_vect2; - mv_PRECISION( eta1, D, phi, num_eig_vect ); + mv_PRECISION( eta1_pt, D, phi_pt, num_eig_vect ); // B - phi += num_eig_vect; + phi_pt += num_eig_vect; D += num_eig_vect2; - mv_PRECISION( eta2, D, phi, num_eig_vect ); + mv_PRECISION( eta2_pt, D, phi_pt, num_eig_vect ); // D - eta2 += num_eig_vect; + eta2_pt += num_eig_vect; D += num_eig_vect2; - mv_PRECISION( eta2, D, phi, num_eig_vect ); + mv_PRECISION( eta2_pt, D, phi_pt, num_eig_vect ); } - static inline void coarse_spinwise_daggered_hopp_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, - vector_PRECISION phi, config_PRECISION D, level_struct *l ) { + static inline void coarse_spinwise_daggered_hopp_PRECISION( vector_PRECISION *eta1, vector_PRECISION *eta2, + vector_PRECISION *phi, config_PRECISION D, level_struct *l ) { int num_eig_vect = l->num_parent_eig_vect, - num_eig_vect2 = SQUARE(l->num_parent_eig_vect); + num_eig_vect2 = SQUARE(l->num_parent_eig_vect); + buffer_PRECISION phi_pt=phi->vector_buffer, eta1_pt=eta1->vector_buffer, eta2_pt=eta2->vector_buffer; // U_mu(x) = [ A B , U_-mu(x+muhat) = [ A* -C* // C D ] -B* D* ] // storage order: A, C, B, D // note: minus sign of D = self_coupling - hopping_term is added here // A* - mvh_PRECISION( eta1, D, phi, num_eig_vect ); + mvh_PRECISION( eta1_pt, D, phi_pt, num_eig_vect ); // -C* - phi += num_eig_vect; + phi_pt += num_eig_vect; D += num_eig_vect2; - nmvh_PRECISION( eta2, D, phi, num_eig_vect ); + nmvh_PRECISION( eta2_pt, D, phi_pt, num_eig_vect ); // -B* - eta1 += num_eig_vect; - phi -= num_eig_vect; + eta1_pt += num_eig_vect; + phi_pt -= 
num_eig_vect; D += num_eig_vect2; - nmvh_PRECISION( eta1, D, phi, num_eig_vect ); + nmvh_PRECISION( eta1_pt, D, phi_pt, num_eig_vect ); // D* - eta2 += num_eig_vect; - phi += num_eig_vect; + eta2_pt += num_eig_vect; + phi_pt += num_eig_vect; D += num_eig_vect2; - mvh_PRECISION( eta2, D, phi, num_eig_vect ); + mvh_PRECISION( eta2_pt, D, phi_pt, num_eig_vect ); } - static inline void coarse_spinwise_n_hopp_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, - vector_PRECISION phi, config_PRECISION D, level_struct *l ) { + static inline void coarse_spinwise_n_hopp_PRECISION( vector_PRECISION *eta1, vector_PRECISION *eta2, + vector_PRECISION *phi, config_PRECISION D, level_struct *l ) { int num_eig_vect = l->num_parent_eig_vect, num_eig_vect2 = SQUARE(l->num_parent_eig_vect); + buffer_PRECISION phi_pt=phi->vector_buffer, eta1_pt=eta1->vector_buffer, eta2_pt=eta2->vector_buffer; // U_mu(x) = [ A B , U_-mu(x+muhat) = [ A* -C* // C D ] -B* D* ] // storage order: A, C, B, D // note: minus sign of D = self_coupling - hopping_term is added here // A - nmv_PRECISION( eta1, D, phi, num_eig_vect ); + nmv_PRECISION( eta1_pt, D, phi_pt, num_eig_vect ); // C - eta1 += num_eig_vect; + eta1_pt += num_eig_vect; D += num_eig_vect2; - nmv_PRECISION( eta1, D, phi, num_eig_vect ); + nmv_PRECISION( eta1_pt, D, phi_pt, num_eig_vect ); // B - phi += num_eig_vect; + phi_pt += num_eig_vect; D += num_eig_vect2; - nmv_PRECISION( eta2, D, phi, num_eig_vect ); + nmv_PRECISION( eta2_pt, D, phi_pt, num_eig_vect ); // D - eta2 += num_eig_vect; + eta2_pt += num_eig_vect; D += num_eig_vect2; - nmv_PRECISION( eta2, D, phi, num_eig_vect ); + nmv_PRECISION( eta2_pt, D, phi_pt, num_eig_vect ); } - static inline void coarse_spinwise_n_daggered_hopp_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, - vector_PRECISION phi, config_PRECISION D, level_struct *l ) { + static inline void coarse_spinwise_n_daggered_hopp_PRECISION( vector_PRECISION *eta1, vector_PRECISION *eta2, + vector_PRECISION *phi, config_PRECISION D, level_struct *l ) { int num_eig_vect = l->num_parent_eig_vect, - num_eig_vect2 = SQUARE(l->num_parent_eig_vect); + num_eig_vect2 = SQUARE(l->num_parent_eig_vect); + buffer_PRECISION phi_pt=phi->vector_buffer, eta1_pt=eta1->vector_buffer, eta2_pt=eta2->vector_buffer; // U_mu(x) = [ A B , U_-mu(x+muhat) = [ A* -C* // C D ] -B* D* ] // storage order: A, C, B, D // note: minus sign of D = self_coupling - hopping_term is added here // A* - nmvh_PRECISION( eta1, D, phi, num_eig_vect ); + nmvh_PRECISION( eta1_pt, D, phi_pt, num_eig_vect ); // -C* - phi += num_eig_vect; + phi_pt += num_eig_vect; D += num_eig_vect2; - mvh_PRECISION( eta2, D, phi, num_eig_vect ); + mvh_PRECISION( eta2_pt, D, phi_pt, num_eig_vect ); // -B* - eta1 += num_eig_vect; - phi -= num_eig_vect; + eta1_pt += num_eig_vect; + phi_pt -= num_eig_vect; D += num_eig_vect2; - mvh_PRECISION( eta1, D, phi, num_eig_vect ); + mvh_PRECISION( eta1_pt, D, phi_pt, num_eig_vect ); // D* - eta2 += num_eig_vect; - phi += num_eig_vect; + eta2_pt += num_eig_vect; + phi_pt += num_eig_vect; D += num_eig_vect2; - nmvh_PRECISION( eta2, D, phi, num_eig_vect ); + nmvh_PRECISION( eta2_pt, D, phi_pt, num_eig_vect ); } #endif diff --git a/src/coarsening_generic.c b/src/coarsening_generic.c index ae7893b..a3c6313 100644 --- a/src/coarsening_generic.c +++ b/src/coarsening_generic.c @@ -30,7 +30,7 @@ void interpolation_PRECISION_struct_init( interpolation_PRECISION_struct *is ) { is->test_vector = NULL; is->interpolation = NULL; is->eigenvalues = NULL; - is->tmp = NULL; + 
vector_PRECISION_init(&(is->tmp)); is->bootstrap_vector = NULL; is->bootstrap_eigenvalues = NULL; } diff --git a/src/data_generic.c b/src/data_generic.c index 950c814..c666644 100644 --- a/src/data_generic.c +++ b/src/data_generic.c @@ -22,7 +22,7 @@ #include "main.h" // vector storage for PRECISION precision -void vector_PRECISION_define( vector_PRECISION phi, complex_PRECISION value, int start, int end, level_struct *l ) { +void buffer_PRECISION_define( complex_PRECISION *phi, complex_PRECISION value, int start, int end, level_struct *l ) { int thread = omp_get_thread_num(); if(thread == 0 && start != end) @@ -39,7 +39,7 @@ void vector_PRECISION_define( vector_PRECISION phi, complex_PRECISION value, int } -void vector_PRECISION_define_random( vector_PRECISION phi, int start, int end, level_struct *l ) { +void vector_PRECISION_define_random( vector_PRECISION *phi, int start, int end, level_struct *l ) { int thread = omp_get_thread_num(); if(thread == 0 && start != end) @@ -47,10 +47,29 @@ void vector_PRECISION_define_random( vector_PRECISION phi, int start, int end, l if ( phi != NULL ) { int i; for ( i=start; ivector_buffer[i] = (PRECISION)(((double)rand()/(double)RAND_MAX))-0.5 + ( (PRECISION)((double)rand()/(double)RAND_MAX)-0.5)*_Complex_I; } else { error0("Error in \"vector_PRECISION_define_random\": pointer is null\n"); } if(thread == 0 && start != end) PROF_PRECISION_STOP( _SET, 1 ); } + + +void vector_PRECISION_define_random_new( vector_PRECISION *phi, level_struct *l, struct Thread *threading ) { + + int start, end; + compute_core_start_end( 0, (phi->size)*(phi->num_vect), &start, &end, l, threading ); + int thread = omp_get_thread_num(); + if(thread == 0) + PROF_PRECISION_START( _SET ); + if ( phi != NULL ) { + int i; + for ( i=start; ivector_buffer[i] = (PRECISION)(((double)rand()/(double)RAND_MAX))-0.5 + ( (PRECISION)((double)rand()/(double)RAND_MAX)-0.5)*_Complex_I; + } else { + error0("Error in \"vector_PRECISION_define_random\": pointer is null\n"); + } + if(thread == 0) + PROF_PRECISION_STOP( _SET, 1 ); +} diff --git a/src/data_generic.h b/src/data_generic.h index b236ab4..76fd875 100644 --- a/src/data_generic.h +++ b/src/data_generic.h @@ -22,7 +22,7 @@ #ifndef DATA_PRECISION_HEADER #define DATA_PRECISION_HEADER - void vector_PRECISION_define( vector_PRECISION phi, complex_PRECISION value, int start, int end, level_struct *l ); - void vector_PRECISION_define_random( vector_PRECISION phi, int start, int end, level_struct *l ); - + void buffer_PRECISION_define( complex_PRECISION *phi, complex_PRECISION value, int start, int end, level_struct *l ); + void vector_PRECISION_define_random( vector_PRECISION *phi, int start, int end, level_struct *l ); + void vector_PRECISION_define_random_new( vector_PRECISION *phi, level_struct *l, struct Thread *threading ); #endif diff --git a/src/dirac.c b/src/dirac.c index 068e8a7..8d85319 100644 --- a/src/dirac.c +++ b/src/dirac.c @@ -44,14 +44,14 @@ void compute_clover_term ( SU3_storage U, level_struct *l ) { #ifdef HAVE_TM if ( g.mu + g.mu_even_shift == 0 && g.mu + g.mu_odd_shift == 0 ) - vector_double_define( op->tm_term, _COMPLEX_double_ZERO, 0, l->inner_vector_size, l ); + buffer_double_define( op->tm_term, _COMPLEX_double_ZERO, 0, l->inner_vector_size, l ); else tm_term_double_setup( g.mu, g.mu_even_shift, g.mu_odd_shift, op, l, no_threading ); #endif #ifdef HAVE_TM1p1 if ( g.epsbar == 0 && g.epsbar_ig5_even_shift == 0 && g.epsbar_ig5_odd_shift == 0 ) - vector_double_define( op->epsbar_term, _COMPLEX_double_ZERO, 0, 
l->inner_vector_size, l ); + buffer_double_define( op->epsbar_term, _COMPLEX_double_ZERO, 0, l->inner_vector_size, l ); else epsbar_term_double_setup( g.epsbar, g.epsbar_ig5_even_shift, g.epsbar_ig5_odd_shift, op, l, no_threading ); #endif @@ -86,7 +86,7 @@ void compute_clover_term ( SU3_storage U, level_struct *l ) { mat_free( &Qstore, 3 ); spin_free( 4, 4 ); } else { - vector_double_define( op->clover, 4+op->m0, 0, l->inner_vector_size, l ); + buffer_double_define( op->clover, 4+op->m0, 0, l->inner_vector_size, l ); } } @@ -436,7 +436,7 @@ void SU3_ghost_update( SU3_storage *U, level_struct *l ) { int t, z, y, x, mu, nu, *ll = l->local_lattice, ls[4], le[4]; long int i, j, send_size, max_size; - vector_double buffer1 = NULL, buffer2 = NULL, buffer3 = NULL, buffer4 = NULL; + buffer_double buffer1 = NULL, buffer2 = NULL, buffer3 = NULL, buffer4 = NULL; max_size = 0; for ( mu=0; mu<4; mu++ ) { @@ -528,11 +528,11 @@ void SU3_ghost_update( SU3_storage *U, level_struct *l ) { send_size = i; ASSERT(send_size<=max_size); } - - FREE( buffer1, complex_double, max_size ); - FREE( buffer2, complex_double, max_size ); - FREE( buffer3, complex_double, max_size ); + FREE( buffer1, complex_double, max_size ); + FREE( buffer2, complex_double, max_size ); + FREE( buffer3, complex_double, max_size ); FREE( buffer4, complex_double, max_size ); + } diff --git a/src/dirac_generic.c b/src/dirac_generic.c index 6be33de..ab420ff 100644 --- a/src/dirac_generic.c +++ b/src/dirac_generic.c @@ -21,12 +21,12 @@ #include "main.h" -void clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operator_PRECISION_struct *op, int start, int end, +void clover_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, operator_PRECISION_struct *op, int start, int end, level_struct *l, struct Thread *threading ) { int nv = l->num_lattice_site_var; - vector_PRECISION lphi = phi+start, leta = eta+start; - vector_PRECISION leta_end = eta+end; + buffer_PRECISION lphi = phi->vector_buffer+start, leta = eta->vector_buffer+start; + buffer_PRECISION leta_end = eta->vector_buffer+end; #ifdef PROFILING START_MASTER(threading) @@ -79,9 +79,6 @@ void clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operator_PREC #endif } else { - -#ifndef OPTIMIZED_SELF_COUPLING_PRECISION - config_PRECISION clover = op->clover+(start/nv)*42; #ifdef HAVE_TM1p1 if( g.n_flavours == 2 ) { @@ -123,28 +120,153 @@ void clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operator_PREC #ifdef HAVE_TM1p1 } #endif - -#else + } #ifdef HAVE_TM1p1 - PRECISION *clover = ( g.n_flavours == 2 ) ? 
op->clover_doublet_vectorized : op->clover_vectorized; -#else - PRECISION *clover = op->clover_vectorized; -#endif - clover += start*12; - while ( leta < leta_end ) { // tm_term included in the clover vectorized - sse_site_clover_PRECISION( (PRECISION*) leta, (PRECISION*) lphi, clover ); - leta += nv; lphi += nv; - clover += 12*nv; + config_PRECISION eps_term = op->epsbar_term+(start/nv)*12; + lphi = phi->vector_buffer+start, leta = eta->vector_buffer+start; + if ( g.n_flavours == 2 && + ( g.epsbar != 0 || g.epsbar_ig5_odd_shift != 0 || g.epsbar_ig5_odd_shift != 0 ) ) + while ( leta < leta_end ) { + lphi += 6; + FOR6( *leta += (*lphi)*(*eps_term); leta++; lphi++; eps_term++; ) + lphi -= 12; + eps_term -= 6; + FOR6( *leta += (*lphi)*(*eps_term); leta++; lphi++; eps_term++; ) + lphi += 6; } - +#endif + + +#ifdef PROFILING + START_MASTER(threading) + PROF_PRECISION_STOP( _SC, 1 ); + END_MASTER(threading) #endif +} + + +void clover_PRECISION_new( vector_PRECISION *eta, vector_PRECISION *phi, operator_PRECISION_struct *op, int start, int end, + level_struct *l, struct Thread *threading ) { + + int nv = l->num_lattice_site_var, n_vect=g.num_rhs_vect, i, j, jj; + buffer_PRECISION lphi = phi->vector_buffer+start*n_vect, leta = eta->vector_buffer+start*n_vect; + buffer_PRECISION leta_end = eta->vector_buffer+end*n_vect; +#ifdef PROFILING + START_MASTER(threading) + PROF_PRECISION_START( _SC ); + END_MASTER(threading) +#endif + +#ifdef HAVE_TM + config_PRECISION tm_term = op->tm_term+(start/nv)*12; +#endif + + if ( g.csw == 0.0 ) { + + config_PRECISION clover = op->clover+(start/nv)*12; +/*#ifdef HAVE_TM1p1 + if( g.n_flavours == 2 ) { +#ifdef HAVE_TM + if ( g.mu + g.mu_odd_shift != 0.0 || g.mu + g.mu_even_shift != 0.0 ) + while ( leta < leta_end ) { + FOR6( *leta = (*lphi)*((*clover)+(*tm_term)); leta++; lphi++; clover++; tm_term++; ); + clover -= 6; + tm_term -= 6; + FOR6( *leta = (*lphi)*((*clover)-(*tm_term)); leta++; lphi++; clover++; tm_term++; ); + FOR6( *leta = (*lphi)*((*clover)+(*tm_term)); leta++; lphi++; clover++; tm_term++; ); + clover -= 6; + tm_term -= 6; + FOR6( *leta = (*lphi)*((*clover)-(*tm_term)); leta++; lphi++; clover++; tm_term++; ); + } + else +#endif + while ( leta < leta_end ) { + FOR6( *leta = (*lphi)*(*clover); leta++; lphi++; clover++; ); + clover -= 6; + FOR12( *leta = (*lphi)*(*clover); leta++; lphi++; clover++; ); + clover -= 6; + FOR6( *leta = (*lphi)*(*clover); leta++; lphi++; clover++; ); + } + } else { +#endif*/ +#ifdef HAVE_TM + if ( g.mu + g.mu_odd_shift != 0.0 || g.mu + g.mu_even_shift != 0.0 ) { + while ( leta < leta_end ) + for( i=0; i<12; i++ ) { + VECTOR_LOOP(j, n_vect, jj, *leta = (*lphi)*((*clover)+(*tm_term)); + leta++; + lphi++;) + clover++; + tm_term++; + } + }// else +#endif + while ( leta < leta_end ) + for( i=0; i<12; i++ ){ + VECTOR_LOOP(j, n_vect, jj, *leta = (*lphi)*(*clover); + leta++; + lphi++;) + clover++; } +/*#ifdef HAVE_TM1p1 + } +#endif*/ + + } else { + config_PRECISION clover = op->clover+(start/nv)*42; +/*#ifdef HAVE_TM1p1 + if( g.n_flavours == 2 ) { +#ifdef HAVE_TM + if ( g.mu + g.mu_odd_shift != 0.0 || g.mu + g.mu_even_shift != 0.0 ) + while ( leta < leta_end ) { + doublet_site_clover_PRECISION( leta, lphi, clover ); + clover+=42; + FOR6( *leta += (*lphi)*(*tm_term); leta++; lphi++; tm_term++; ); + tm_term -= 6; + FOR6( *leta -=(*lphi)*(*tm_term); leta++; lphi++; tm_term++; ); + FOR6( *leta += (*lphi)*(*tm_term); leta++; lphi++; tm_term++; ); + tm_term -= 6; + FOR6( *leta -= (*lphi)*(*tm_term); leta++; lphi++; tm_term++; ); + } + else 
+#endif + while ( leta < leta_end ) { + doublet_site_clover_PRECISION( leta, lphi, clover ); + leta+=24; lphi+=24; + clover+=42; + } + } else { +#endif*/ +#ifdef HAVE_TM + if ( g.mu + g.mu_odd_shift != 0.0 || g.mu + g.mu_even_shift != 0.0 ) + while ( leta < leta_end ) { + site_clover_PRECISION_new( leta, lphi, clover ); + for( i=0; i<12; i++ ){ + VECTOR_LOOP(j, n_vect, jj, *leta += (*lphi)*(*tm_term); + leta++; + lphi++;) + tm_term++; + } + clover+=42; + } + // else +#endif + while ( leta < leta_end ) { + site_clover_PRECISION_new( leta, lphi, clover ); + leta+=12*n_vect; lphi+=12*n_vect; + clover+=42; + } +/*#ifdef HAVE_TM1p1 + } +#endif */ + } +/* #ifdef HAVE_TM1p1 config_PRECISION eps_term = op->epsbar_term+(start/nv)*12; - lphi = phi+start, leta = eta+start; + lphi = phi->vector_buffer+start+phi_shift, leta = eta->vector_buffer+start+eta_shift; if ( g.n_flavours == 2 && ( g.epsbar != 0 || g.epsbar_ig5_odd_shift != 0 || g.epsbar_ig5_odd_shift != 0 ) ) while ( leta < leta_end ) { @@ -156,7 +278,7 @@ void clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operator_PREC lphi += 6; } #endif - +*/ #ifdef PROFILING START_MASTER(threading) @@ -166,59 +288,52 @@ void clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operator_PREC } -static void spin0and1_clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, config_PRECISION clover, level_struct *l ) { + +static void spin0and1_clover_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, config_PRECISION clover, level_struct *l ) { - vector_PRECISION eta_end = eta + l->inner_vector_size; + buffer_PRECISION eta_end = eta->vector_buffer + l->inner_vector_size, leta = eta->vector_buffer, lphi = phi->vector_buffer; if ( g.csw == 0.0 ) { - while ( eta < eta_end ) { - FOR6( *eta = (*phi)*(*clover); eta++; phi++; clover++; ) - FOR6( *eta = _COMPLEX_PRECISION_ZERO; eta++; ) - phi+=6; clover+=6; + while ( leta < eta_end ) { + FOR6( *leta = (*lphi)*(*clover); leta++; lphi++; clover++; ) + FOR6( *leta = _COMPLEX_PRECISION_ZERO; leta++; ) + lphi+=6; clover+=6; } } else { - while ( eta < eta_end ) { - spin0and1_site_clover_PRECISION( eta, phi, clover ); - eta+=12; phi+=12; clover+=42; + while ( leta < eta_end ) { + spin0and1_site_clover_PRECISION( leta, lphi, clover ); + leta+=12; lphi+=12; clover+=42; } } } -static void spin2and3_clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, config_PRECISION clover, level_struct *l ) { +static void spin2and3_clover_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, config_PRECISION clover, level_struct *l ) { - vector_PRECISION eta_end = eta + l->inner_vector_size; + buffer_PRECISION eta_end = eta->vector_buffer + l->inner_vector_size, leta = eta->vector_buffer, lphi = phi->vector_buffer; if ( g.csw == 0.0 ) { - while ( eta < eta_end ) { - phi+=6; clover+=6; - FOR6( *eta = _COMPLEX_PRECISION_ZERO; eta++; ) - FOR6( *eta = (*phi)*(*clover); eta++; phi++; clover++; ) + while ( leta < eta_end ) { + lphi+=6; clover+=6; + FOR6( *leta = _COMPLEX_PRECISION_ZERO; leta++; ) + FOR6( *leta = (*lphi)*(*clover); leta++; lphi++; clover++; ) } } else { - while ( eta < eta_end ) { - spin2and3_site_clover_PRECISION( eta, phi, clover ); - eta +=12; phi+=12; clover+=42; + while ( leta < eta_end ) { + spin2and3_site_clover_PRECISION( leta, lphi, clover ); + leta +=12; lphi+=12; clover+=42; } } } -void block_d_plus_clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, int start, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ) { +void 
block_d_plus_clover_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, int start, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ) { START_UNTHREADED_FUNCTION(threading) int n = s->num_block_sites, *length = s->dir_length, **index = s->index, *neighbor = s->op.neighbor_table, nv = l->num_lattice_site_var; - vector_PRECISION lphi = phi+start, leta = eta+start; + buffer_PRECISION lphi = phi->vector_buffer+start, leta = eta->vector_buffer+start; // clover term clover_PRECISION(eta, phi, &(s->op), start, start+nv*n, l, no_threading ); -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_float // block operator vectorized just in the float environment - PRECISION *Dplus = s->op.D_vectorized + (start/nv)*96; - PRECISION *Dminus = s->op.D_transformed_vectorized + (start/nv)*96; - for ( int mu=0; mu<4; mu++ ) { - block_oddeven_plus_coupling_PRECISION( (PRECISION*)leta, Dplus, (PRECISION*)lphi, mu, 0, length[mu], index[mu], neighbor ); - block_oddeven_minus_coupling_PRECISION( (PRECISION*)leta, Dminus, (PRECISION*)lphi, mu, 0, length[mu], index[mu], neighbor ); - } -#else int i, j, k, *ind; config_PRECISION D_pt; config_PRECISION D = s->op.D + (start/nv)*36; @@ -344,28 +459,22 @@ void block_d_plus_clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, } #ifdef HAVE_TM1p1 } -#endif #endif END_UNTHREADED_FUNCTION(threading) } -void d_plus_clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { +void d_plus_clover_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { int n = l->num_inner_lattice_sites, *neighbor = op->neighbor_table, start, end, nv = l->num_lattice_site_var; -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - complex_PRECISION *prn[4] = { op->prnT, op->prnZ, op->prnY, op->prnX }; - complex_PRECISION *prp[4] = { op->prpT, op->prpZ, op->prpY, op->prpX }; -#else int i, j, *nb_pt; - vector_PRECISION phi_pt, eta_pt, end_pt; + buffer_PRECISION phi_pt, eta_pt, end_pt; config_PRECISION D_pt; -#endif compute_core_start_end(0, nv*n, &start, &end, l, threading ); SYNC_MASTER_TO_ALL(threading) - clover_PRECISION(eta, phi, op, start, end, l, threading ); + clover_PRECISION( eta, phi, op, start, end, l, threading ); START_MASTER(threading) PROF_PRECISION_START( _NC ); @@ -373,17 +482,13 @@ void d_plus_clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operat #ifdef HAVE_TM1p1 if( g.n_flavours == 2 ) { -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - dprp_PRECISION( prn, phi, start, end ); -#else complex_PRECISION pbuf[12]; - for ( i=start/2, phi_pt=phi+start; ivector_buffer+start; iprnT+i, phi_pt ); dprp_Z_PRECISION( op->prnZ+i, phi_pt ); dprp_Y_PRECISION( op->prnY+i, phi_pt ); dprp_X_PRECISION( op->prnX+i, phi_pt ); } -#endif // start communication in negative direction START_LOCKED_MASTER(threading) ghost_sendrecv_PRECISION( op->prnT, T, -1, &(op->c), _FULL_SYSTEM, l ); @@ -392,11 +497,8 @@ void d_plus_clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operat ghost_sendrecv_PRECISION( op->prnX, X, -1, &(op->c), _FULL_SYSTEM, l ); END_LOCKED_MASTER(threading) -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - dprn_su3_PRECISION( prp, phi, op, neighbor, start, end ); -#else // project plus dir and multiply with U dagger - for ( phi_pt=phi+start, end_pt=phi+end, D_pt = op->D+((start/nv)*36), nb_pt=neighbor+((start/nv)*4); phi_ptvector_buffer+start, end_pt=phi->vector_buffer+end, D_pt = 
op->D+((start/nv)*36), nb_pt=neighbor+((start/nv)*4); phi_ptprpX+j+6, D_pt, pbuf+6 ); mvmh_PRECISION( op->prpX+j+9, D_pt, pbuf+9 ); D_pt += 9; } -#endif // start communication in positive direction START_LOCKED_MASTER(threading) @@ -441,11 +542,8 @@ void d_plus_clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operat ghost_wait_PRECISION( op->prnX, X, -1, &(op->c), _FULL_SYSTEM, l ); END_LOCKED_MASTER(threading) -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - su3_dpbp_PRECISION( eta, prn, op, neighbor, start, end ); -#else // multiply with U and lift up minus dir - for ( eta_pt=eta+start, end_pt=eta+end, D_pt = op->D+(start/nv)*36, nb_pt=neighbor+(start/nv)*4; eta_ptvector_buffer+start, end_pt=eta->vector_buffer+end, D_pt = op->D+(start/nv)*36, nb_pt=neighbor+(start/nv)*4; eta_ptprnT+j ); @@ -475,7 +573,6 @@ void d_plus_clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operat mvm_PRECISION( pbuf+9, D_pt, op->prnX+j+9 ); dpbp_su3_X_PRECISION( pbuf, eta_pt ); D_pt += 9; } -#endif // wait for communication in positive direction START_LOCKED_MASTER(threading) @@ -486,30 +583,22 @@ void d_plus_clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operat END_LOCKED_MASTER(threading) // lift up plus dir -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - dpbn_PRECISION( eta, prp, start, end ); -#else - for ( i=start/2, eta_pt=eta+start; ivector_buffer+start; iprpT+i, eta_pt ); dpbn_su3_Z_PRECISION( op->prpZ+i, eta_pt ); dpbn_su3_Y_PRECISION( op->prpY+i, eta_pt ); dpbn_su3_X_PRECISION( op->prpX+i, eta_pt ); } -#endif } else { #endif -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - prp_PRECISION( prn, phi, start, end ); -#else complex_PRECISION pbuf[6]; - for ( i=start/2, phi_pt=phi+start; ivector_buffer+start; iprnT+i, phi_pt ); prp_Z_PRECISION( op->prnZ+i, phi_pt ); prp_Y_PRECISION( op->prnY+i, phi_pt ); prp_X_PRECISION( op->prnX+i, phi_pt ); } -#endif // start communication in negative direction START_LOCKED_MASTER(threading) ghost_sendrecv_PRECISION( op->prnT, T, -1, &(op->c), _FULL_SYSTEM, l ); @@ -519,10 +608,7 @@ void d_plus_clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operat END_LOCKED_MASTER(threading) // project plus dir and multiply with U dagger -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - prn_su3_PRECISION( prp, phi, op, neighbor, start, end ); -#else - for ( phi_pt=phi+start, end_pt=phi+end, D_pt = op->D+(start*3), nb_pt=neighbor+((start/12)*4); phi_ptvector_buffer+start, end_pt=phi->vector_buffer+end, D_pt = op->D+(start*3), nb_pt=neighbor+((start/12)*4); phi_ptprpX+j, D_pt, pbuf ); mvmh_PRECISION( op->prpX+j+3, D_pt, pbuf+3 ); D_pt += 9; } -#endif // start communication in positive direction START_LOCKED_MASTER(threading) @@ -560,10 +645,7 @@ void d_plus_clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operat END_LOCKED_MASTER(threading) // multiply with U and lift up minus dir -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - su3_pbp_PRECISION( eta, prn, op, neighbor, start, end ); -#else - for ( eta_pt=eta+start, end_pt=eta+end, D_pt = op->D+start*3, nb_pt=neighbor+(start/12)*4; eta_ptvector_buffer+start, end_pt=eta->vector_buffer+end, D_pt = op->D+start*3, nb_pt=neighbor+(start/12)*4; eta_ptprnT+j ); @@ -585,7 +667,6 @@ void d_plus_clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operat mvm_PRECISION( pbuf+3, D_pt, op->prnX+j+3 ); pbp_su3_X_PRECISION( pbuf, eta_pt ); D_pt += 9; } -#endif // wait for communication in positive direction START_LOCKED_MASTER(threading) @@ -596,16 +677,12 @@ void 
d_plus_clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operat END_LOCKED_MASTER(threading) // lift up plus dir -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - pbn_PRECISION( eta, prp, start, end ); -#else - for ( i=start/2, eta_pt=eta+start; ivector_buffer+start; iprpT+i, eta_pt ); pbn_su3_Z_PRECISION( op->prpZ+i, eta_pt ); pbn_su3_Y_PRECISION( op->prpY+i, eta_pt ); pbn_su3_X_PRECISION( op->prpX+i, eta_pt ); } -#endif #ifdef HAVE_TM1p1 } #endif @@ -618,48 +695,282 @@ void d_plus_clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operat } -void gamma5_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ) { +void d_plus_clover_PRECISION_new( vector_PRECISION *eta, vector_PRECISION *phi, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { + + int n = l->num_inner_lattice_sites, *neighbor = op->neighbor_table, start, end, nv = l->num_lattice_site_var, n_vect = g.num_rhs_vect; + int i, j, *nb_pt; + buffer_PRECISION phi_pt, eta_pt, end_pt; + config_PRECISION D_pt; + //int phi_shift = (phi->num_vect == 1)?0:phi->size*n_vec, eta_shift = (eta->num_vect == 1)?0:eta->size*n_vec; + + compute_core_start_end(0, nv*n, &start, &end, l, threading ); + + //vector_PRECISION_change_layout( phi, phi, _LV_SV_NV, no_threading ); + //vector_PRECISION_change_layout( eta, eta, _LV_SV_NV, no_threading ); + + SYNC_MASTER_TO_ALL(threading) + clover_PRECISION_new( eta, phi, op, start, end, l, threading ); + START_MASTER(threading) + PROF_PRECISION_START( _NC ); + END_MASTER(threading) +/* +#ifdef HAVE_TM1p1 + if( g.n_flavours == 2 ) { + complex_PRECISION pbuf[12]; + for ( i=start/2, phi_pt=phi->vector_buffer+start+phi_shift; iprnT+i, phi_pt ); + dprp_Z_PRECISION( op->prnZ+i, phi_pt ); + dprp_Y_PRECISION( op->prnY+i, phi_pt ); + dprp_X_PRECISION( op->prnX+i, phi_pt ); + } + // start communication in negative direction + START_LOCKED_MASTER(threading) + ghost_sendrecv_PRECISION( op->prnT, T, -1, &(op->c), _FULL_SYSTEM, l ); + ghost_sendrecv_PRECISION( op->prnZ, Z, -1, &(op->c), _FULL_SYSTEM, l ); + ghost_sendrecv_PRECISION( op->prnY, Y, -1, &(op->c), _FULL_SYSTEM, l ); + ghost_sendrecv_PRECISION( op->prnX, X, -1, &(op->c), _FULL_SYSTEM, l ); + END_LOCKED_MASTER(threading) + // project plus dir and multiply with U dagger + for ( phi_pt=phi->vector_buffer+start+phi_shift,c end_pt=phi->vector_buffer+end+phi_shift, D_pt = op->D+((start/nv)*36), nb_pt=neighbor+((start/nv)*4); phi_ptprpT+j, D_pt, pbuf ); + mvmh_PRECISION( op->prpT+j+3, D_pt, pbuf+3 ); + mvmh_PRECISION( op->prpT+j+6, D_pt, pbuf+6 ); + mvmh_PRECISION( op->prpT+j+9, D_pt, pbuf+9 ); D_pt += 9; + // Z dir + j = nv/2*(*nb_pt); nb_pt++; + dprn_Z_PRECISION( pbuf, phi_pt ); + mvmh_PRECISION( op->prpZ+j, D_pt, pbuf ); + mvmh_PRECISION( op->prpZ+j+3, D_pt, pbuf+3 ); + mvmh_PRECISION( op->prpZ+j+6, D_pt, pbuf+6 ); + mvmh_PRECISION( op->prpZ+j+9, D_pt, pbuf+9 ); D_pt += 9; + // Y dir + j = nv/2*(*nb_pt); nb_pt++; + dprn_Y_PRECISION( pbuf, phi_pt ); + mvmh_PRECISION( op->prpY+j, D_pt, pbuf ); + mvmh_PRECISION( op->prpY+j+3, D_pt, pbuf+3 ); + mvmh_PRECISION( op->prpY+j+6, D_pt, pbuf+6 ); + mvmh_PRECISION( op->prpY+j+9, D_pt, pbuf+9 ); D_pt += 9; + // X dir + j = nv/2*(*nb_pt); nb_pt++; + dprn_X_PRECISION( pbuf, phi_pt ); + mvmh_PRECISION( op->prpX+j, D_pt, pbuf ); + mvmh_PRECISION( op->prpX+j+3, D_pt, pbuf+3 ); + mvmh_PRECISION( op->prpX+j+6, D_pt, pbuf+6 ); + mvmh_PRECISION( op->prpX+j+9, D_pt, pbuf+9 ); D_pt += 9; + } + // start communication in positive direction + 
START_LOCKED_MASTER(threading) + ghost_sendrecv_PRECISION( op->prpT, T, +1, &(op->c), _FULL_SYSTEM, l ); + ghost_sendrecv_PRECISION( op->prpZ, Z, +1, &(op->c), _FULL_SYSTEM, l ); + ghost_sendrecv_PRECISION( op->prpY, Y, +1, &(op->c), _FULL_SYSTEM, l ); + ghost_sendrecv_PRECISION( op->prpX, X, +1, &(op->c), _FULL_SYSTEM, l ); + // wait for communication in negative direction + ghost_wait_PRECISION( op->prnT, T, -1, &(op->c), _FULL_SYSTEM, l ); + ghost_wait_PRECISION( op->prnZ, Z, -1, &(op->c), _FULL_SYSTEM, l ); + ghost_wait_PRECISION( op->prnY, Y, -1, &(op->c), _FULL_SYSTEM, l ); + ghost_wait_PRECISION( op->prnX, X, -1, &(op->c), _FULL_SYSTEM, l ); + END_LOCKED_MASTER(threading) + + // multiply with U and lift up minus dir + for ( eta_pt=eta->vector_buffer+start+eta_shift, end_pt=eta->vector_buffer+end+eta_shift, D_pt = op->D+(start/nv)*36, nb_pt=neighbor+(start/nv)*4; eta_ptprnT+j ); + mvm_PRECISION( pbuf+3, D_pt, op->prnT+j+3 ); + mvm_PRECISION( pbuf+6, D_pt, op->prnT+j+6 ); + mvm_PRECISION( pbuf+9, D_pt, op->prnT+j+9 ); + dpbp_su3_T_PRECISION( pbuf, eta_pt ); D_pt += 9; + // Z dir + j = nv/2*(*nb_pt); nb_pt++; + mvm_PRECISION( pbuf, D_pt, op->prnZ+j ); + mvm_PRECISION( pbuf+3, D_pt, op->prnZ+j+3 ); + mvm_PRECISION( pbuf+6, D_pt, op->prnZ+j+6 ); + mvm_PRECISION( pbuf+9, D_pt, op->prnZ+j+9 ); + dpbp_su3_Z_PRECISION( pbuf, eta_pt ); D_pt += 9; + // Y dir + j = nv/2*(*nb_pt); nb_pt++; + mvm_PRECISION( pbuf, D_pt, op->prnY+j ); + mvm_PRECISION( pbuf+3, D_pt, op->prnY+j+3 ); + mvm_PRECISION( pbuf+6, D_pt, op->prnY+j+6 ); + mvm_PRECISION( pbuf+9, D_pt, op->prnY+j+9 ); + dpbp_su3_Y_PRECISION( pbuf, eta_pt ); D_pt += 9; + // X dir + j = nv/2*(*nb_pt); nb_pt++; + mvm_PRECISION( pbuf, D_pt, op->prnX+j ); + mvm_PRECISION( pbuf+3, D_pt, op->prnX+j+3 ); + mvm_PRECISION( pbuf+6, D_pt, op->prnX+j+6 ); + mvm_PRECISION( pbuf+9, D_pt, op->prnX+j+9 ); + dpbp_su3_X_PRECISION( pbuf, eta_pt ); D_pt += 9; + } + + // wait for communication in positive direction + START_LOCKED_MASTER(threading) + ghost_wait_PRECISION( op->prpT, T, +1, &(op->c), _FULL_SYSTEM, l ); + ghost_wait_PRECISION( op->prpZ, Z, +1, &(op->c), _FULL_SYSTEM, l ); + ghost_wait_PRECISION( op->prpY, Y, +1, &(op->c), _FULL_SYSTEM, l ); + ghost_wait_PRECISION( op->prpX, X, +1, &(op->c), _FULL_SYSTEM, l ); + END_LOCKED_MASTER(threading) + + // lift up plus dir + for ( i=start/2, eta_pt=eta->vector_buffer+start+eta_shift; iprpT+i, eta_pt ); + dpbn_su3_Z_PRECISION( op->prpZ+i, eta_pt ); + dpbn_su3_Y_PRECISION( op->prpY+i, eta_pt ); + dpbn_su3_X_PRECISION( op->prpX+i, eta_pt ); + } + } else { +#endif +*/ + complex_PRECISION pbuf[6*n_vect]; + for ( i=start*n_vect/2, phi_pt=phi->vector_buffer+start*n_vect; iprnT+i, phi_pt ); + prp_Z_PRECISION_new( op->prnZ+i, phi_pt ); + prp_Y_PRECISION_new( op->prnY+i, phi_pt ); + prp_X_PRECISION_new( op->prnX+i, phi_pt ); + } + // start communication in negative direction + START_LOCKED_MASTER(threading) + ghost_sendrecv_PRECISION( op->prnT, T, -1, &(op->c), _FULL_SYSTEM, l ); + ghost_sendrecv_PRECISION( op->prnZ, Z, -1, &(op->c), _FULL_SYSTEM, l ); + ghost_sendrecv_PRECISION( op->prnY, Y, -1, &(op->c), _FULL_SYSTEM, l ); + ghost_sendrecv_PRECISION( op->prnX, X, -1, &(op->c), _FULL_SYSTEM, l ); + END_LOCKED_MASTER(threading) + + // project plus dir and multiply with U dagger + for ( phi_pt=phi->vector_buffer+start*n_vect, end_pt=phi->vector_buffer+end*n_vect, D_pt = op->D+(start*3), nb_pt=neighbor+((start/12)*4); phi_ptprpT+j, D_pt, pbuf ); + mvmh_PRECISION_new( op->prpT+j+3*n_vect, D_pt, pbuf+3*n_vect ); D_pt += 9; + 
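/* Hedged aside (not part of the patch): the *_new code path above assumes an
 * interleaved multi-right-hand-side layout: the n_vect = g.num_rhs_vect values
 * of one spin-colour component are stored contiguously, so component c of
 * right-hand side r lives at index c*n_vect + r.  The helper below only
 * illustrates that layout; pack_interleaved_rhs is a hypothetical name, not a
 * function of this code base. */
static void pack_interleaved_rhs( complex_PRECISION *out, complex_PRECISION **in,
                                  int n_components, int n_vect ) {
  for ( int c=0; c<n_components; c++ )   /* spin-colour components of the block */
    for ( int r=0; r<n_vect; r++ )       /* right-hand sides: fastest index */
      out[c*n_vect + r] = in[r][c];      /* in[r] is the r-th single-RHS buffer */
}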
// Z dir + j = 6*(*nb_pt)*n_vect; nb_pt++; + prn_Z_PRECISION_new( pbuf, phi_pt ); + mvmh_PRECISION_new( op->prpZ+j, D_pt, pbuf ); + mvmh_PRECISION_new( op->prpZ+j+3*n_vect, D_pt, pbuf+3*n_vect ); D_pt += 9; + // Y dir + j = 6*(*nb_pt)*n_vect; nb_pt++; + prn_Y_PRECISION_new( pbuf, phi_pt ); + mvmh_PRECISION_new( op->prpY+j, D_pt, pbuf ); + mvmh_PRECISION_new( op->prpY+j+3*n_vect, D_pt, pbuf+3*n_vect ); D_pt += 9; + // X dir + j = 6*(*nb_pt)*n_vect; nb_pt++; + prn_X_PRECISION_new( pbuf, phi_pt ); + mvmh_PRECISION_new( op->prpX+j, D_pt, pbuf ); + mvmh_PRECISION_new( op->prpX+j+3*n_vect, D_pt, pbuf+3*n_vect ); D_pt += 9; + } + + // start communication in positive direction + START_LOCKED_MASTER(threading) + ghost_sendrecv_PRECISION( op->prpT, T, +1, &(op->c), _FULL_SYSTEM, l ); + ghost_sendrecv_PRECISION( op->prpZ, Z, +1, &(op->c), _FULL_SYSTEM, l ); + ghost_sendrecv_PRECISION( op->prpY, Y, +1, &(op->c), _FULL_SYSTEM, l ); + ghost_sendrecv_PRECISION( op->prpX, X, +1, &(op->c), _FULL_SYSTEM, l ); + // wait for communication in negative direction + ghost_wait_PRECISION( op->prnT, T, -1, &(op->c), _FULL_SYSTEM, l ); + ghost_wait_PRECISION( op->prnZ, Z, -1, &(op->c), _FULL_SYSTEM, l ); + ghost_wait_PRECISION( op->prnY, Y, -1, &(op->c), _FULL_SYSTEM, l ); + ghost_wait_PRECISION( op->prnX, X, -1, &(op->c), _FULL_SYSTEM, l ); + END_LOCKED_MASTER(threading) + + // multiply with U and lift up minus dir + for ( eta_pt=eta->vector_buffer+start*n_vect, end_pt=eta->vector_buffer+end*n_vect, D_pt = op->D+start*3, nb_pt=neighbor+(start/12)*4; eta_ptprnT+j ); + mvm_PRECISION_new( pbuf+3*n_vect, D_pt, op->prnT+j+3*n_vect ); + pbp_su3_T_PRECISION_new( pbuf, eta_pt ); D_pt += 9; + // Z dir + j = 6*(*nb_pt)*n_vect; nb_pt++; + mvm_PRECISION_new( pbuf, D_pt, op->prnZ+j ); + mvm_PRECISION_new( pbuf+3*n_vect, D_pt, op->prnZ+j+3*n_vect ); + pbp_su3_Z_PRECISION_new( pbuf, eta_pt ); D_pt += 9; + // Y dir + j = 6*(*nb_pt)*n_vect; nb_pt++; + mvm_PRECISION_new( pbuf, D_pt, op->prnY+j ); + mvm_PRECISION_new( pbuf+3*n_vect, D_pt, op->prnY+j+3*n_vect ); + pbp_su3_Y_PRECISION_new( pbuf, eta_pt ); D_pt += 9; + // X dir + j = 6*(*nb_pt)*n_vect; nb_pt++; + mvm_PRECISION_new( pbuf, D_pt, op->prnX+j ); + mvm_PRECISION_new( pbuf+3*n_vect, D_pt, op->prnX+j+3*n_vect ); + pbp_su3_X_PRECISION_new( pbuf, eta_pt ); D_pt += 9; + } + + // wait for communication in positive direction + START_LOCKED_MASTER(threading) + ghost_wait_PRECISION( op->prpT, T, +1, &(op->c), _FULL_SYSTEM, l ); + ghost_wait_PRECISION( op->prpZ, Z, +1, &(op->c), _FULL_SYSTEM, l ); + ghost_wait_PRECISION( op->prpY, Y, +1, &(op->c), _FULL_SYSTEM, l ); + ghost_wait_PRECISION( op->prpX, X, +1, &(op->c), _FULL_SYSTEM, l ); + END_LOCKED_MASTER(threading) + + // lift up plus dir + for ( i=start*n_vect/2, eta_pt=eta->vector_buffer+start*n_vect; iprpT+i, eta_pt ); + pbn_su3_Z_PRECISION_new( op->prpZ+i, eta_pt ); + pbn_su3_Y_PRECISION_new( op->prpY+i, eta_pt ); + pbn_su3_X_PRECISION_new( op->prpX+i, eta_pt ); + } +/*#ifdef HAVE_TM1p1 + } +#endif*/ + + //vector_PRECISION_change_layout( phi, phi, _NV_LV_SV, no_threading ); + //vector_PRECISION_change_layout( eta, eta, _NV_LV_SV, no_threading ); + + START_MASTER(threading) + PROF_PRECISION_STOP( _NC, 1 ); + END_MASTER(threading) + + SYNC_MASTER_TO_ALL(threading) +} + + + +void gamma5_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, level_struct *l, struct Thread *threading ) { ASSERT(l->depth == 0); - vector_PRECISION eta_end = eta + threading->end_index[l->depth]; - eta += threading->start_index[l->depth]; - phi += 
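/* Hedged aside: in the spin basis used here gamma_5 acts on the 12 spin-colour
 * components of a site as diag(-1,...,-1,+1,...,+1) (six minus, six plus),
 * which is exactly what the FOR6 pairs of gamma5_PRECISION below implement
 * (FOR12 pairs for the two-flavour doublet).  Hypothetical per-site sketch: */
static void gamma5_site( complex_PRECISION *eta, const complex_PRECISION *phi ) {
  for ( int c=0;  c<6;  c++ ) eta[c] = -phi[c];  /* spins 0,1: sign flip */
  for ( int c=6;  c<12; c++ ) eta[c] =  phi[c];  /* spins 2,3: copy */
}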
threading->start_index[l->depth]; + buffer_PRECISION eta_end = eta->vector_buffer + threading->end_index[l->depth]; + buffer_PRECISION leta = eta->vector_buffer + threading->start_index[l->depth], lphi = phi->vector_buffer + threading->start_index[l->depth]; #ifdef HAVE_TM1p1 if ( g.n_flavours == 2 ) { - while ( eta < eta_end ) { - FOR12( *eta = -(*phi); phi++; eta++; ) - FOR12( *eta = (*phi); phi++; eta++; ) + while ( leta < eta_end ) { + FOR12( *leta = -(*lphi); lphi++; leta++; ) + FOR12( *leta = (*lphi); lphi++; leta++; ) } } else #endif - while ( eta < eta_end ) { - FOR6( *eta = -(*phi); phi++; eta++; ) - FOR6( *eta = (*phi); phi++; eta++; ) + while ( leta < eta_end ) { + FOR6( *leta = -(*lphi); lphi++; leta++; ) + FOR6( *leta = (*lphi); lphi++; leta++; ) } } -void tau1_gamma5_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ) { +void tau1_gamma5_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, level_struct *l, struct Thread *threading ) { ASSERT(l->depth == 0); #ifdef HAVE_TM1p1 if ( g.n_flavours == 2 ) { - vector_PRECISION eta_end = eta + threading->end_index[l->depth]; + buffer_PRECISION eta_end = eta->vector_buffer + threading->end_index[l->depth]; + buffer_PRECISION leta = eta->vector_buffer + threading->start_index[l->depth], lphi = phi->vector_buffer + threading->start_index[l->depth]; complex_PRECISION b[6]; - eta += threading->start_index[l->depth]; - phi += threading->start_index[l->depth]; - while ( eta < eta_end ) { + while ( leta < eta_end ) { int i = 0; - FOR6( b[i] = (*phi); phi++; i++; ); - FOR6( *eta = -(*phi); phi++; eta++; ); + FOR6( b[i] = (*lphi); lphi++; i++; ); + FOR6( *leta = -(*lphi); lphi++; leta++; ); i = 0; - FOR6( *eta = - b[i] ; eta++; i++; ); + FOR6( *leta = - b[i] ; leta++; i++; ); i = 0; - FOR6( b[i] = (*phi); phi++; i++; ); - FOR6( *eta = (*phi); phi++; eta++; ); + FOR6( b[i] = (*lphi); lphi++; i++; ); + FOR6( *leta = (*lphi); lphi++; leta++; ); i = 0; - FOR6( *eta = b[i] ; eta++; i++; ); + FOR6( *leta = b[i] ; leta++; i++; ); } } else #endif @@ -671,100 +982,97 @@ void tau1_gamma5_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_st } } -void set_even_to_zero_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ) { +void set_even_to_zero_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, level_struct *l, struct Thread *threading ) { ASSERT(l->depth == 0); int i = threading->start_site[l->depth]; - vector_PRECISION eta_end = eta + threading->end_index[l->depth]; - eta += threading->start_index[l->depth]; - phi += threading->start_index[l->depth]; + buffer_PRECISION eta_end = eta->vector_buffer + threading->end_index[l->depth]; + buffer_PRECISION leta = eta->vector_buffer + threading->start_index[l->depth], lphi = phi->vector_buffer + threading->start_index[l->depth]; #ifdef HAVE_TM1p1 if ( g.n_flavours == 2 ) - while ( eta < eta_end ) { + while ( leta < eta_end ) { if(g.odd_even_table[i]==_ODD) { - FOR24( *eta = (*phi); phi++; eta++; ); + FOR24( *leta = (*lphi); lphi++; leta++; ); } else if(g.odd_even_table[i]==_EVEN) { - FOR24( *eta = _COMPLEX_PRECISION_ZERO; phi++; eta++; ); + FOR24( *leta = _COMPLEX_PRECISION_ZERO; lphi++; leta++; ); } i++; } else #endif - while ( eta < eta_end ) { + while ( leta < eta_end ) { if(g.odd_even_table[i]==_ODD) { - FOR12( *eta = (*phi); phi++; eta++; ); + FOR12( *leta = (*lphi); lphi++; leta++; ); } else if(g.odd_even_table[i]==_EVEN) { - FOR12( *eta = _COMPLEX_PRECISION_ZERO; phi++; eta++; ); + FOR12( 
*leta = _COMPLEX_PRECISION_ZERO; lphi++; leta++; ); } i++; } } -void gamma5_set_even_to_zero_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ) { +void gamma5_set_even_to_zero_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, level_struct *l, struct Thread *threading ) { ASSERT(l->depth == 0); int i = threading->start_site[l->depth]; - vector_PRECISION eta_end = eta + threading->end_index[l->depth]; - eta += threading->start_index[l->depth]; - phi += threading->start_index[l->depth]; + buffer_PRECISION eta_end = eta->vector_buffer + threading->end_index[l->depth]; + buffer_PRECISION leta = eta->vector_buffer + threading->start_index[l->depth], lphi = phi->vector_buffer + threading->start_index[l->depth]; #ifdef HAVE_TM1p1 if ( g.n_flavours == 2 ) - while ( eta < eta_end ) { + while ( leta < eta_end ) { if(g.odd_even_table[i]==_ODD){ - FOR12( *eta = -(*phi); phi++; eta++; ); - FOR12( *eta = (*phi); phi++; eta++; ); + FOR12( *leta = -(*lphi); lphi++; leta++; ); + FOR12( *leta = (*lphi); lphi++; leta++; ); } else if(g.odd_even_table[i]==_EVEN){ - FOR24( *eta = _COMPLEX_PRECISION_ZERO; phi++; eta++; ); + FOR24( *leta = _COMPLEX_PRECISION_ZERO; lphi++; leta++; ); } i++; } else #endif - while ( eta < eta_end ) { + while ( leta < eta_end ) { if(g.odd_even_table[i]==_ODD){ - FOR6( *eta = -(*phi); phi++; eta++; ); - FOR6( *eta = (*phi); phi++; eta++; ); + FOR6( *leta = -(*lphi); lphi++; leta++; ); + FOR6( *leta = (*lphi); lphi++; leta++; ); } else if(g.odd_even_table[i]==_EVEN){ - FOR12( *eta = _COMPLEX_PRECISION_ZERO; phi++; eta++; ); + FOR12( *leta = _COMPLEX_PRECISION_ZERO; lphi++; leta++; ); } i++; } } -void tau1_gamma5_set_even_to_zero_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ) { +void tau1_gamma5_set_even_to_zero_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, level_struct *l, struct Thread *threading ) { ASSERT(l->depth == 0); #ifdef HAVE_TM1p1 if ( g.n_flavours == 2 ) { int i = threading->start_site[l->depth]; - vector_PRECISION eta_end = eta + threading->end_index[l->depth]; - eta += threading->start_index[l->depth]; - phi += threading->start_index[l->depth]; + buffer_PRECISION eta_end = eta->vector_buffer + threading->end_index[l->depth]; + buffer_PRECISION leta = eta->vector_buffer + threading->start_index[l->depth], lphi = phi->vector_buffer + threading->start_index[l->depth]; complex_PRECISION b[6]; - while ( eta < eta_end ) { + while ( leta < eta_end ) { if(g.odd_even_table[i]==_ODD){ int i = 0; - FOR6( b[i] = (*phi); phi++; i++; ); - FOR6( *eta = -(*phi); phi++; eta++; ); + FOR6( b[i] = (*lphi); lphi++; i++; ); + FOR6( *leta = -(*lphi); lphi++; leta++; ); i = 0; - FOR6( *eta = - b[i] ; eta++; i++; ); + FOR6( *leta = - b[i] ; leta++; i++; ); i = 0; - FOR6( b[i] = (*phi); phi++; i++; ); - FOR6( *eta = (*phi); phi++; eta++; ); + FOR6( b[i] = (*lphi); lphi++; i++; ); + FOR6( *leta = (*lphi); lphi++; leta++; ); i = 0; - FOR6( *eta = b[i] ; eta++; i++; ); + FOR6( *leta = b[i] ; leta++; i++; ); } else if(g.odd_even_table[i]==_EVEN){ - FOR24( *eta = _COMPLEX_PRECISION_ZERO; phi++; eta++; ); + FOR24( *leta = _COMPLEX_PRECISION_ZERO; lphi++; leta++; ); } i++; } @@ -778,96 +1086,93 @@ void tau1_gamma5_set_even_to_zero_PRECISION( vector_PRECISION eta, vector_PRECIS } } -void set_odd_to_zero_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ) { +void set_odd_to_zero_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, 
level_struct *l, struct Thread *threading ) { int i = threading->start_site[l->depth]; - vector_PRECISION eta_end = eta + threading->end_index[l->depth]; - eta += threading->start_index[l->depth]; - phi += threading->start_index[l->depth]; + buffer_PRECISION eta_end = eta->vector_buffer + threading->end_index[l->depth]; + buffer_PRECISION leta = eta->vector_buffer + threading->start_index[l->depth], lphi = phi->vector_buffer + threading->start_index[l->depth]; #ifdef HAVE_TM1p1 if ( g.n_flavours == 2 ) - while ( eta < eta_end ) { + while ( leta < eta_end ) { if(g.odd_even_table[i]==_EVEN){ - FOR24( *eta = (*phi); phi++; eta++; ); + FOR24( *leta = (*lphi); lphi++; leta++; ); } else if(g.odd_even_table[i]==_ODD){ - FOR24( *eta = 0; phi++; eta++; ); + FOR24( *leta = 0; lphi++; leta++; ); } i++; } else #endif - while ( eta < eta_end ) { + while ( leta < eta_end ) { if(g.odd_even_table[i]==_EVEN) { - FOR12( *eta = (*phi); phi++; eta++; ); + FOR12( *leta = (*lphi); lphi++; leta++; ); } else if(g.odd_even_table[i]==_ODD) { - FOR12( *eta = 0; phi++; eta++; ); + FOR12( *leta = 0; lphi++; leta++; ); } i++; } } -void gamma5_set_odd_to_zero_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ) { +void gamma5_set_odd_to_zero_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, level_struct *l, struct Thread *threading ) { int i = threading->start_site[l->depth]; - vector_PRECISION eta_end = eta + threading->end_index[l->depth]; - eta += threading->start_index[l->depth]; - phi += threading->start_index[l->depth]; + buffer_PRECISION eta_end = eta->vector_buffer + threading->end_index[l->depth]; + buffer_PRECISION leta = eta->vector_buffer + threading->start_index[l->depth], lphi = phi->vector_buffer + threading->start_index[l->depth]; #ifdef HAVE_TM1p1 if ( g.n_flavours == 2 ) - while ( eta < eta_end ) { + while ( leta < eta_end ) { if(g.odd_even_table[i]==_EVEN){ - FOR12( *eta = -(*phi); phi++; eta++; ); - FOR12( *eta = (*phi); phi++; eta++; ); + FOR12( *leta = -(*lphi); lphi++; leta++; ); + FOR12( *leta = (*lphi); lphi++; leta++; ); } else if(g.odd_even_table[i]==_ODD){ - FOR24( *eta = 0; phi++; eta++; ); + FOR24( *leta = 0; lphi++; leta++; ); } i++; } else #endif - while ( eta < eta_end ) { + while ( leta < eta_end ) { if(g.odd_even_table[i]==_EVEN){ - FOR6( *eta = -(*phi); phi++; eta++; ); - FOR6( *eta = (*phi); phi++; eta++; ); + FOR6( *leta = -(*lphi); lphi++; leta++; ); + FOR6( *leta = (*lphi); lphi++; leta++; ); } else if(g.odd_even_table[i]==_ODD){ - FOR12( *eta = 0; phi++; eta++; ); + FOR12( *leta = 0; lphi++; leta++; ); } i++; } } -void tau1_gamma5_set_odd_to_zero_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ) { +void tau1_gamma5_set_odd_to_zero_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, level_struct *l, struct Thread *threading ) { ASSERT(l->depth == 0); #ifdef HAVE_TM1p1 if ( g.n_flavours == 2 ) { int i = threading->start_site[l->depth]; - vector_PRECISION eta_end = eta + threading->end_index[l->depth]; - eta += threading->start_index[l->depth]; - phi += threading->start_index[l->depth]; + buffer_PRECISION eta_end = eta->vector_buffer + threading->end_index[l->depth]; + buffer_PRECISION leta = eta->vector_buffer + threading->start_index[l->depth], lphi = phi->vector_buffer + threading->start_index[l->depth]; complex_PRECISION b[6]; - while ( eta < eta_end ) { + while ( leta < eta_end ) { if(g.odd_even_table[i]==_EVEN){ int i = 0; - FOR6( b[i] = (*phi); phi++; i++; ); - FOR6( 
*eta = -(*phi); phi++; eta++; ); + FOR6( b[i] = (*lphi); lphi++; i++; ); + FOR6( *leta = -(*lphi); lphi++; leta++; ); i = 0; - FOR6( *eta = - b[i] ; eta++; i++; ); + FOR6( *leta = - b[i] ; leta++; i++; ); i = 0; - FOR6( b[i] = (*phi); phi++; i++; ); - FOR6( *eta = (*phi); phi++; eta++; ); + FOR6( b[i] = (*lphi); lphi++; i++; ); + FOR6( *leta = (*lphi); lphi++; leta++; ); i = 0; - FOR6( *eta = b[i] ; eta++; i++; ); + FOR6( *leta = b[i] ; leta++; i++; ); } else if(g.odd_even_table[i]==_ODD){ - FOR24( *eta = _COMPLEX_PRECISION_ZERO; phi++; eta++; ); + FOR24( *leta = _COMPLEX_PRECISION_ZERO; lphi++; leta++; ); } i++; } @@ -881,40 +1186,39 @@ void tau1_gamma5_set_odd_to_zero_PRECISION( vector_PRECISION eta, vector_PRECISI } } -void scale_even_odd_PRECISION( vector_PRECISION eta, vector_PRECISION phi, complex_double even, complex_double odd, +void scale_even_odd_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, complex_double even, complex_double odd, level_struct *l, struct Thread *threading ) { int i = threading->start_site[l->depth]; - vector_PRECISION eta_end = eta + threading->end_index[l->depth]; - eta += threading->start_index[l->depth]; - phi += threading->start_index[l->depth]; + buffer_PRECISION eta_end = eta->vector_buffer + threading->end_index[l->depth]; + buffer_PRECISION leta = eta->vector_buffer + threading->start_index[l->depth], lphi = phi->vector_buffer + threading->start_index[l->depth]; #ifdef HAVE_TM1p1 if ( g.n_flavours == 2 ) - while ( eta < eta_end ) { + while ( leta < eta_end ) { if(g.odd_even_table[i]==_EVEN){ - FOR24( *eta = even*(*phi); phi++; eta++; ); + FOR24( *leta = even*(*lphi); lphi++; leta++; ); } else if(g.odd_even_table[i]==_ODD){ - FOR24( *eta = odd*(*phi); phi++; eta++; ); + FOR24( *leta = odd*(*lphi); lphi++; leta++; ); } i++; } else #endif - while ( eta < eta_end ) { + while ( leta < eta_end ) { if(g.odd_even_table[i]==_EVEN) { - FOR12( *eta = even*(*phi); phi++; eta++; ); + FOR12( *leta = even*(*lphi); lphi++; leta++; ); } else if(g.odd_even_table[i]==_ODD) { - FOR12( *eta = odd*(*phi); phi++; eta++; ); + FOR12( *leta = odd*(*lphi); lphi++; leta++; ); } i++; } } -void two_flavours_to_serial_PRECISION( vector_PRECISION flav1, vector_PRECISION flav2, vector_PRECISION serial, level_struct *l, struct Thread *threading ) { +void two_flavours_to_serial_PRECISION( vector_PRECISION *flav1, vector_PRECISION *flav2, vector_PRECISION *serial, level_struct *l, struct Thread *threading ) { #ifdef HAVE_TM1p1 @@ -924,26 +1228,27 @@ void two_flavours_to_serial_PRECISION( vector_PRECISION flav1, vector_PRECISION * spin2and3 of flav1 * spin2and3 of flav2 */ - vector_PRECISION serial_end; + buffer_PRECISION serial_end; + buffer_PRECISION serial_pt = serial->vector_buffer, flav1_pt = flav1->vector_buffer, flav2_pt = flav2->vector_buffer; if( g.n_flavours == 2 ) { - serial_end = serial + threading->end_index[l->depth]; - serial += threading->start_index[l->depth]; - flav1 += threading->start_index[l->depth]/2; - flav2 += threading->start_index[l->depth]/2; + serial_end = serial->vector_buffer + threading->end_index[l->depth]; + serial_pt += threading->start_index[l->depth]; + flav1_pt += threading->start_index[l->depth]/2; + flav2_pt += threading->start_index[l->depth]/2; } else { - serial_end = serial + threading->end_index[l->depth]*2; - serial += threading->start_index[l->depth]*2; - flav1 += threading->start_index[l->depth]; - flav2 += threading->start_index[l->depth]; + serial_end = serial->vector_buffer + threading->end_index[l->depth]*2; + serial_pt += 
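/* Hedged aside: per site, the copy loop of two_flavours_to_serial_PRECISION
 * below builds the 24-component doublet layout  [ spins 0,1 of flavour 1 |
 * spins 0,1 of flavour 2 | spins 2,3 of flavour 1 | spins 2,3 of flavour 2 ].
 * Hypothetical per-site sketch (doublet_site_to_serial is not a function of
 * this code base): */
static void doublet_site_to_serial( complex_PRECISION *serial,
                                    const complex_PRECISION *f1,
                                    const complex_PRECISION *f2 ) {
  for ( int c=0; c<6; c++ ) serial[c]    = f1[c];    /* spins 0,1 of flavour 1 */
  for ( int c=0; c<6; c++ ) serial[6+c]  = f2[c];    /* spins 0,1 of flavour 2 */
  for ( int c=0; c<6; c++ ) serial[12+c] = f1[6+c];  /* spins 2,3 of flavour 1 */
  for ( int c=0; c<6; c++ ) serial[18+c] = f2[6+c];  /* spins 2,3 of flavour 2 */
}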
threading->start_index[l->depth]*2; + flav1_pt += threading->start_index[l->depth]; + flav2_pt += threading->start_index[l->depth]; } - while ( serial < serial_end ) { - FOR6( *serial = (*flav1); serial++; flav1++; ) - FOR6( *serial = (*flav2); serial++; flav2++; ) - FOR6( *serial = (*flav1); serial++; flav1++; ) - FOR6( *serial = (*flav2); serial++; flav2++; ) + while ( serial_pt < serial_end ) { + FOR6( *serial_pt = (*flav1_pt); serial_pt++; flav1_pt++; ) + FOR6( *serial_pt = (*flav2_pt); serial_pt++; flav2_pt++; ) + FOR6( *serial_pt = (*flav1_pt); serial_pt++; flav1_pt++; ) + FOR6( *serial_pt = (*flav2_pt); serial_pt++; flav2_pt++; ) } #else START_MASTER(threading) @@ -953,29 +1258,30 @@ void two_flavours_to_serial_PRECISION( vector_PRECISION flav1, vector_PRECISION } -void serial_to_two_flavours_PRECISION( vector_PRECISION flav1, vector_PRECISION flav2, vector_PRECISION serial, level_struct *l, struct Thread *threading ) { +void serial_to_two_flavours_PRECISION( vector_PRECISION *flav1, vector_PRECISION *flav2, vector_PRECISION *serial, level_struct *l, struct Thread *threading ) { #ifdef HAVE_TM1p1 - vector_PRECISION serial_end; - + buffer_PRECISION serial_end; + buffer_PRECISION serial_pt = serial->vector_buffer, flav1_pt = flav1->vector_buffer, flav2_pt = flav2->vector_buffer; + if( g.n_flavours == 2 ) { - serial_end = serial + threading->end_index[l->depth]; - serial += threading->start_index[l->depth]; - flav1 += threading->start_index[l->depth]/2; - flav2 += threading->start_index[l->depth]/2; + serial_end = serial->vector_buffer + threading->end_index[l->depth]; + serial_pt += threading->start_index[l->depth]; + flav1_pt += threading->start_index[l->depth]/2; + flav2_pt += threading->start_index[l->depth]/2; } else { - serial_end = serial + threading->end_index[l->depth]*2; - serial += threading->start_index[l->depth]*2; - flav1 += threading->start_index[l->depth]; - flav2 += threading->start_index[l->depth]; + serial_end = serial->vector_buffer + threading->end_index[l->depth]*2; + serial_pt += threading->start_index[l->depth]*2; + flav1_pt += threading->start_index[l->depth]; + flav2_pt += threading->start_index[l->depth]; } - while ( serial < serial_end ) { - FOR6( *flav1 = (*serial); serial++; flav1++; ) - FOR6( *flav2 = (*serial); serial++; flav2++; ) - FOR6( *flav1 = (*serial); serial++; flav1++; ) - FOR6( *flav2 = (*serial); serial++; flav2++; ) + while ( serial_pt < serial_end ) { + FOR6( *flav1_pt = (*serial_pt); serial_pt++; flav1_pt++; ) + FOR6( *flav2_pt = (*serial_pt); serial_pt++; flav2_pt++; ) + FOR6( *flav1_pt = (*serial_pt); serial_pt++; flav1_pt++; ) + FOR6( *flav2_pt = (*serial_pt); serial_pt++; flav2_pt++; ) } #else START_MASTER(threading) @@ -985,28 +1291,28 @@ void serial_to_two_flavours_PRECISION( vector_PRECISION flav1, vector_PRECISION } -void g5D_plus_clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { +void g5D_plus_clover_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { d_plus_clover_PRECISION( eta, phi, op, l, threading ); SYNC_CORES(threading) gamma5_PRECISION( eta, eta, l, threading ); SYNC_CORES(threading) } -void diagonal_aggregate_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, vector_PRECISION phi, config_PRECISION diag, level_struct *l ) { +void diagonal_aggregate_PRECISION( vector_PRECISION *eta1, vector_PRECISION *eta2, vector_PRECISION *phi, config_PRECISION 
diag, level_struct *l ) { - vector_PRECISION eta_end = eta1 + l->inner_vector_size; - - while ( eta1 < eta_end ) { - FOR6( *eta1 = (*phi)*(*diag); *eta2 = _COMPLEX_PRECISION_ZERO; eta1++; eta2++; phi++; diag++; ); - FOR6( *eta2 = (*phi)*(*diag); *eta1 = _COMPLEX_PRECISION_ZERO; eta1++; eta2++; phi++; diag++; ); + buffer_PRECISION eta_end = eta1->vector_buffer + l->inner_vector_size; + buffer_PRECISION eta1_pt = eta1->vector_buffer, eta2_pt = eta2->vector_buffer, phi_pt = phi->vector_buffer; + while ( eta1_pt < eta_end ) { + FOR6( *eta1_pt = (*phi_pt)*(*diag); *eta2_pt = _COMPLEX_PRECISION_ZERO; eta1_pt++; eta2_pt++; phi_pt++; diag++; ); + FOR6( *eta2_pt = (*phi_pt)*(*diag); *eta1_pt = _COMPLEX_PRECISION_ZERO; eta1_pt++; eta2_pt++; phi_pt++; diag++; ); } } -void d_plus_clover_aggregate_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, vector_PRECISION phi, schwarz_PRECISION_struct *s, level_struct *l ) { +void d_plus_clover_aggregate_PRECISION( vector_PRECISION *eta1, vector_PRECISION *eta2, vector_PRECISION *phi, schwarz_PRECISION_struct *s, level_struct *l ) { int i, length, index1, index2, *index_dir, *neighbor = s->op.neighbor_table; - vector_PRECISION eta1_pt, eta2_pt, phi_pt; + buffer_PRECISION eta1_pt, eta2_pt, phi_pt; complex_PRECISION buffer1[12], buffer2[12]; config_PRECISION D_pt, D = s->op.D; @@ -1018,84 +1324,84 @@ void d_plus_clover_aggregate_PRECISION( vector_PRECISION eta1, vector_PRECISION length = l->is_PRECISION.agg_length[T]; index_dir = l->is_PRECISION.agg_index[T]; for ( i=0; ivector_buffer + 12*index2; D_pt = D + 36*index1+9*T; mvm_PRECISION( buffer1, D_pt, phi_pt ); mvm_PRECISION( buffer1+3, D_pt, phi_pt+3 ); mvm_PRECISION( buffer1+6, D_pt, phi_pt+6 ); mvm_PRECISION( buffer1+9, D_pt, phi_pt+9 ); - phi_pt = phi + 12*index1; + phi_pt = phi->vector_buffer + 12*index1; mvmh_PRECISION( buffer2, D_pt, phi_pt ); mvmh_PRECISION( buffer2+3, D_pt, phi_pt+3 ); mvmh_PRECISION( buffer2+6, D_pt, phi_pt+6 ); mvmh_PRECISION( buffer2+9, D_pt, phi_pt+9 ); - eta1_pt = eta1 + 12*index1; eta2_pt = eta2 + 12*index1; + eta1_pt = eta1->vector_buffer + 12*index1; eta2_pt = eta2->vector_buffer + 12*index1; twospin_p_T_PRECISION( eta1_pt, eta2_pt, buffer1 ); - eta1_pt = eta1 + 12*index2; eta2_pt = eta2 + 12*index2; + eta1_pt = eta1->vector_buffer + 12*index2; eta2_pt = eta2->vector_buffer + 12*index2; twospin_n_T_PRECISION( eta1_pt, eta2_pt, buffer2 ); } // Z dir length = l->is_PRECISION.agg_length[Z]; index_dir = l->is_PRECISION.agg_index[Z]; for ( i=0; ivector_buffer + 12*index2; D_pt = D + 36*index1+9*Z; mvm_PRECISION( buffer1, D_pt, phi_pt ); mvm_PRECISION( buffer1+3, D_pt, phi_pt+3 ); mvm_PRECISION( buffer1+6, D_pt, phi_pt+6 ); mvm_PRECISION( buffer1+9, D_pt, phi_pt+9 ); - phi_pt = phi + 12*index1; + phi_pt = phi->vector_buffer + 12*index1; mvmh_PRECISION( buffer2, D_pt, phi_pt ); mvmh_PRECISION( buffer2+3, D_pt, phi_pt+3 ); mvmh_PRECISION( buffer2+6, D_pt, phi_pt+6 ); mvmh_PRECISION( buffer2+9, D_pt, phi_pt+9 ); - eta1_pt = eta1 + 12*index1; eta2_pt = eta2 + 12*index1; + eta1_pt = eta1->vector_buffer + 12*index1; eta2_pt = eta2->vector_buffer + 12*index1; twospin_p_Z_PRECISION( eta1_pt, eta2_pt, buffer1 ); - eta1_pt = eta1 + 12*index2; eta2_pt = eta2 + 12*index2; + eta1_pt = eta1->vector_buffer + 12*index2; eta2_pt = eta2->vector_buffer + 12*index2; twospin_n_Z_PRECISION( eta1_pt, eta2_pt, buffer2 ); } // Y dir length = l->is_PRECISION.agg_length[Y]; index_dir = l->is_PRECISION.agg_index[Y]; for ( i=0; ivector_buffer + 12*index2; D_pt = D + 36*index1+9*Y; mvm_PRECISION( buffer1, 
D_pt, phi_pt ); mvm_PRECISION( buffer1+3, D_pt, phi_pt+3 ); mvm_PRECISION( buffer1+6, D_pt, phi_pt+6 ); mvm_PRECISION( buffer1+9, D_pt, phi_pt+9 ); - phi_pt = phi + 12*index1; + phi_pt = phi->vector_buffer + 12*index1; mvmh_PRECISION( buffer2, D_pt, phi_pt ); mvmh_PRECISION( buffer2+3, D_pt, phi_pt+3 ); mvmh_PRECISION( buffer2+6, D_pt, phi_pt+6 ); mvmh_PRECISION( buffer2+9, D_pt, phi_pt+9 ); - eta1_pt = eta1 + 12*index1; eta2_pt = eta2 + 12*index1; + eta1_pt = eta1->vector_buffer + 12*index1; eta2_pt = eta2->vector_buffer + 12*index1; twospin_p_Y_PRECISION( eta1_pt, eta2_pt, buffer1 ); - eta1_pt = eta1 + 12*index2; eta2_pt = eta2 + 12*index2; + eta1_pt = eta1->vector_buffer + 12*index2; eta2_pt = eta2->vector_buffer + 12*index2; twospin_n_Y_PRECISION( eta1_pt, eta2_pt, buffer2 ); } // X dir length = l->is_PRECISION.agg_length[X]; index_dir = l->is_PRECISION.agg_index[X]; for ( i=0; ivector_buffer + 12*index2; D_pt = D + 36*index1+9*X; mvm_PRECISION( buffer1, D_pt, phi_pt ); mvm_PRECISION( buffer1+3, D_pt, phi_pt+3 ); mvm_PRECISION( buffer1+6, D_pt, phi_pt+6 ); mvm_PRECISION( buffer1+9, D_pt, phi_pt+9 ); - phi_pt = phi + 12*index1; + phi_pt = phi->vector_buffer + 12*index1; mvmh_PRECISION( buffer2, D_pt, phi_pt ); mvmh_PRECISION( buffer2+3, D_pt, phi_pt+3 ); mvmh_PRECISION( buffer2+6, D_pt, phi_pt+6 ); mvmh_PRECISION( buffer2+9, D_pt, phi_pt+9 ); - eta1_pt = eta1 + 12*index1; eta2_pt = eta2 + 12*index1; + eta1_pt = eta1->vector_buffer + 12*index1; eta2_pt = eta2->vector_buffer + 12*index1; twospin_p_X_PRECISION( eta1_pt, eta2_pt, buffer1 ); - eta1_pt = eta1 + 12*index2; eta2_pt = eta2 + 12*index2; + eta1_pt = eta1->vector_buffer + 12*index2; eta2_pt = eta2->vector_buffer + 12*index2; twospin_n_X_PRECISION( eta1_pt, eta2_pt, buffer2 ); } } -void d_neighbor_aggregate_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, vector_PRECISION phi, const int mu, schwarz_PRECISION_struct *s, level_struct *l ) { +void d_neighbor_aggregate_PRECISION( vector_PRECISION *eta1, vector_PRECISION *eta2, vector_PRECISION *phi, const int mu, schwarz_PRECISION_struct *s, level_struct *l ) { int i, length, index1, index2, *index_dir, *neighbor; - vector_PRECISION eta1_pt, eta2_pt, phi_pt; + buffer_PRECISION eta1_pt, eta2_pt, phi_pt; complex_PRECISION buffer1[12]; config_PRECISION D_pt, D = s->op.D; @@ -1108,54 +1414,54 @@ void d_neighbor_aggregate_PRECISION( vector_PRECISION eta1, vector_PRECISION eta // T dir for ( i=0; ivector_buffer + 12*index2; D_pt = D + 36*index1 + 9*T; mvm_PRECISION( buffer1, D_pt, phi_pt ); mvm_PRECISION( buffer1+3, D_pt, phi_pt+3 ); mvm_PRECISION( buffer1+6, D_pt, phi_pt+6 ); mvm_PRECISION( buffer1+9, D_pt, phi_pt+9 ); - eta1_pt = eta1 + 12*index1; eta2_pt = eta2 + 12*index1; + eta1_pt = eta1->vector_buffer + 12*index1; eta2_pt = eta2->vector_buffer + 12*index1; twospin2_p_T_PRECISION( eta1_pt, eta2_pt, buffer1 ); } } else if ( mu == Z ) { // Z dir for ( i=0; ivector_buffer + 12*index2; D_pt = D + 36*index1 + 9*Z; mvm_PRECISION( buffer1, D_pt, phi_pt ); mvm_PRECISION( buffer1+3, D_pt, phi_pt+3 ); mvm_PRECISION( buffer1+6, D_pt, phi_pt+6 ); mvm_PRECISION( buffer1+9, D_pt, phi_pt+9 ); - eta1_pt = eta1 + 12*index1; eta2_pt = eta2 + 12*index1; + eta1_pt = eta1->vector_buffer + 12*index1; eta2_pt = eta2->vector_buffer + 12*index1; twospin2_p_Z_PRECISION( eta1_pt, eta2_pt, buffer1 ); } } else if ( mu == Y ) { // Y dir for ( i=0; ivector_buffer + 12*index2; D_pt = D + 36*index1 + 9*Y; mvm_PRECISION( buffer1, D_pt, phi_pt ); mvm_PRECISION( buffer1+3, D_pt, phi_pt+3 ); mvm_PRECISION( 
buffer1+6, D_pt, phi_pt+6 ); mvm_PRECISION( buffer1+9, D_pt, phi_pt+9 ); - eta1_pt = eta1 + 12*index1; eta2_pt = eta2 + 12*index1; + eta1_pt = eta1->vector_buffer + 12*index1; eta2_pt = eta2->vector_buffer + 12*index1; twospin2_p_Y_PRECISION( eta1_pt, eta2_pt, buffer1 ); } } else if ( mu == X ) { // X dir for ( i=0; ivector_buffer + 12*index2; D_pt = D + 36*index1 + 9*X; mvm_PRECISION( buffer1, D_pt, phi_pt ); mvm_PRECISION( buffer1+3, D_pt, phi_pt+3 ); mvm_PRECISION( buffer1+6, D_pt, phi_pt+6 ); mvm_PRECISION( buffer1+9, D_pt, phi_pt+9 ); - eta1_pt = eta1 + 12*index1; eta2_pt = eta2 + 12*index1; + eta1_pt = eta1->vector_buffer + 12*index1; eta2_pt = eta2->vector_buffer + 12*index1; twospin2_p_X_PRECISION( eta1_pt, eta2_pt, buffer1 ); } } } -void apply_twisted_bc_to_vector_PRECISION( vector_PRECISION eta, vector_PRECISION phi, double *theta, level_struct *l) { +void apply_twisted_bc_to_vector_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, double *theta, level_struct *l) { int t, z, y, x, i; int *gl=l->global_lattice, sl[4]; double phase[4]; @@ -1174,10 +1480,46 @@ void apply_twisted_bc_to_vector_PRECISION( vector_PRECISION eta, vector_PRECISIO twisted_bc = exp(I*phase[X]); #ifdef HAVE_TM1p1 if( g.n_flavours == 2 ) { - FOR24( *eta = (*phi)*twisted_bc; phi++; eta++; ); + FOR24( *eta->vector_buffer = (*phi->vector_buffer)*twisted_bc; phi->vector_buffer++; eta->vector_buffer++; ); } else #endif - { FOR12( *eta = (*phi)*twisted_bc; phi++; eta++; ) } + { FOR12( *eta->vector_buffer = (*phi->vector_buffer)*twisted_bc; phi->vector_buffer++; eta->vector_buffer++; ) } + } + } + } + } +} + +void apply_twisted_bc_to_vector_PRECISION_new( vector_PRECISION *eta, vector_PRECISION *phi, double *theta, level_struct *l) { + int t, z, y, x, i, j; + int n_vect=g.num_rhs_vect; + int *gl=l->global_lattice, sl[4]; + double phase[4]; + complex_double twisted_bc; + for (i=0; i<4; i++) + sl[i] = l->local_lattice[i]*g.my_coords[i]; + + for (t=0; tlocal_lattice[0]; t++) { + phase[T] = theta[T]*((double)sl[T]+t)/(double)gl[T]; + for (z=0; zlocal_lattice[1]; z++) { + phase[Z] = phase[T] + theta[Z]*((double)sl[Z]+z)/(double)gl[Z]; + for (y=0; ylocal_lattice[2]; y++) { + phase[Y] = phase[Z] + theta[Y]*((double)sl[Y]+y)/(double)gl[Y]; + for (x=0; xlocal_lattice[3]; x++) { + phase[X] = phase[Y] + theta[X]*((double)sl[X]+x)/(double)gl[X]; + twisted_bc = exp(I*phase[X]); +/*#ifdef HAVE_TM1p1 + if( g.n_flavours == 2 ) { + FOR24( *eta->vector_buffer = (*phi->vector_buffer)*twisted_bc; phi->vector_buffer++; eta->vector_buffer++; ); + } else +#endif*/ + for (i=0; i<12; i++){ + for(j=0; jvector_buffer = (*phi->vector_buffer)*twisted_bc; + phi->vector_buffer++; + eta->vector_buffer++; + } + } } } } @@ -1188,13 +1530,9 @@ void operator_updates_PRECISION( level_struct *l, struct Thread *threading ) { if ( l->level > 0 ) { if ( !l->idle ) { -#ifdef INTERPOLATION_SETUP_LAYOUT_OPTIMIZED_PRECISION - coarse_operator_PRECISION_setup_vectorized( l->is_PRECISION.operator, l, threading ); - START_LOCKED_MASTER(threading) -#else START_LOCKED_MASTER(threading) coarse_operator_PRECISION_setup( l->is_PRECISION.interpolation, l ); -#endif + conf_PRECISION_gather( &(l->next_level->s_PRECISION.op), &(l->next_level->op_PRECISION), l->next_level ); END_LOCKED_MASTER(threading) if ( !l->next_level->idle && l->next_level->level > 0 ) { @@ -1457,73 +1795,81 @@ void two_flavours_test_PRECISION( operator_PRECISION_struct *op, level_struct *l #ifdef HAVE_TM1p1 double diff; - vector_double vd1=NULL, vd2, vd3, vd4, vdd1, vdd2, vdd3, vdd4; - 
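/* Hedged aside on apply_twisted_bc_to_vector_PRECISION(_new) above: the phase
 * accumulated over the four directions is
 *   twisted_bc = exp( i * sum_mu theta[mu] * (sl[mu]+x_mu) / gl[mu] ),
 * with sl[mu] the global offset of the local sublattice and gl[mu] the global
 * lattice extent.  Hypothetical per-site sketch (assumes complex_double is a
 * C99 double complex, cexp and I from <complex.h>): */
static complex_double twisted_phase( const double *theta, const int *global_coord,
                                     const int *gl ) {
  double phase = 0;
  for ( int mu=0; mu<4; mu++ )                       /* T, Z, Y, X */
    phase += theta[mu] * (double)global_coord[mu] / (double)gl[mu];
  return cexp( I*phase );                            /* factor applied to each site value */
}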
vector_PRECISION vpp1=NULL, vpp2; + vector_double vd[4], vdd[4]; + vector_PRECISION vpp[2]; + + for(int i=0; i<4; i++){ + vector_double_init( &vd[i] ); + vector_double_alloc( &vd[i], _INNER, 1, l, threading ); + vector_double_init( &vdd[i] ); + vector_double_alloc( &vdd[i], _INNER, 2, l, threading ); + } + + for(int i=0; i<2; i++){ + vector_PRECISION_init( &vpp[i] ); + vector_PRECISION_alloc( &vpp[i], _INNER, 2, l, threading ); + } ASSERT(g.n_flavours==2); data_layout_n_flavours( 1, l, threading ); - int ivs = l->inner_vector_size; - - PUBLIC_MALLOC( vd1, complex_double, 4*ivs + 2*4*ivs ); - PUBLIC_MALLOC( vpp1, complex_PRECISION, 2*2*ivs ); - - vd2 = vd1 + ivs; vd3 = vd2 + ivs; vd4 = vd3 + ivs; - vdd1 = vd4 + ivs; vdd2 = vdd1 + 2*ivs; vdd3 = vdd2 + 2*ivs; vdd4 = vdd3 + 2*ivs; - vpp2 = vpp1 + 2*ivs; - START_LOCKED_MASTER(threading) - vector_double_define_random( vd1, 0, l->inner_vector_size, l ); - vector_double_define_random( vd2, 0, l->inner_vector_size, l ); - apply_operator_double( vd3, vd1, &(g.p), l, no_threading ); + vector_double_define_random( &vd[0], 0, l->inner_vector_size, l ); + vector_double_define_random( &vd[1], 0, l->inner_vector_size, l ); + apply_operator_double( &vd[2], &vd[0], &(g.p), l, no_threading ); #ifdef HAVE_TM - vector_double_real_scale( g.op_double.tm_term, g.op_double.tm_term, -1, 0, l->inner_vector_size, l ); + buffer_double_real_scale( g.op_double.tm_term, g.op_double.tm_term, -1, 0, l->inner_vector_size, l ); #endif - apply_operator_double( vd4, vd2, &(g.p), l, no_threading ); + apply_operator_double( &vd[3], &vd[1], &(g.p), l, no_threading ); #ifdef HAVE_TM - vector_double_real_scale( g.op_double.tm_term, g.op_double.tm_term, -1, 0, l->inner_vector_size, l ); + buffer_double_real_scale( g.op_double.tm_term, g.op_double.tm_term, -1, 0, l->inner_vector_size, l ); #endif - add_diagonal_double( vd3, vd2, g.op_double.epsbar_term, l->inner_vector_size ); - add_diagonal_double( vd4, vd1, g.op_double.epsbar_term, l->inner_vector_size ); + add_diagonal_double( &vd[2], &vd[1], g.op_double.epsbar_term, l->inner_vector_size ); + add_diagonal_double( &vd[3], &vd[0], g.op_double.epsbar_term, l->inner_vector_size ); - two_flavours_to_serial_double( vd1, vd2, vdd1, l, no_threading ); - two_flavours_to_serial_double( vd3, vd4, vdd2, l, no_threading ); + two_flavours_to_serial_double( &vd[0], &vd[1], &vdd[0], l, no_threading ); + two_flavours_to_serial_double( &vd[2], &vd[3], &vdd[1], l, no_threading ); END_LOCKED_MASTER(threading) data_layout_n_flavours( 2, l, threading ); START_LOCKED_MASTER(threading) - trans_PRECISION( vpp1, vdd1, op->translation_table, l, no_threading ); - apply_operator_PRECISION( vpp2, vpp1, &(l->p_PRECISION), l, no_threading ); - trans_back_PRECISION( vdd3, vpp2, op->translation_table, l, no_threading ); + trans_PRECISION( &vpp[0], &vdd[0], op->translation_table, l, no_threading ); + apply_operator_PRECISION( &vpp[1], &vpp[0], &(l->p_PRECISION), l, no_threading ); + trans_back_PRECISION( &vdd[2], &vpp[1], op->translation_table, l, no_threading ); - vector_double_minus( vdd4, vdd3, vdd2, 0, l->inner_vector_size, l ); - diff = global_norm_double( vdd4, 0, l->inner_vector_size, l, no_threading ) / - global_norm_double( vdd3, 0, l->inner_vector_size, l, no_threading ); + vector_double_minus( &vdd[3], &vdd[2], &vdd[1], 0, l->inner_vector_size, l ); + diff = global_norm_double( &vdd[3], 0, l->inner_vector_size, l, no_threading ) / + global_norm_double( &vdd[2], 0, l->inner_vector_size, l, no_threading ); test0_PRECISION("depth: %d, correctness of doublet 
Dirac operator PRECISION: %le\n", l->depth, diff ); END_LOCKED_MASTER(threading) if(threading->n_core > 1) { - trans_PRECISION( vpp1, vdd1, op->translation_table, l, threading ); - apply_operator_PRECISION( vpp2, vpp1, &(l->p_PRECISION), l, threading ); - trans_back_PRECISION( vdd3, vpp2, op->translation_table, l, threading ); + trans_PRECISION( &vpp[0], &vdd[0], op->translation_table, l, threading ); + apply_operator_PRECISION( &vpp[1], &vpp[0], &(l->p_PRECISION), l, threading ); + trans_back_PRECISION( &vdd[2], &vpp[1], op->translation_table, l, threading ); SYNC_MASTER_TO_ALL(threading) SYNC_CORES(threading) START_LOCKED_MASTER(threading) - vector_double_minus( vdd4, vdd3, vdd2, 0, l->inner_vector_size, l ); - diff = global_norm_double( vdd4, 0, l->inner_vector_size, l, no_threading ) / - global_norm_double( vdd3, 0, l->inner_vector_size, l, no_threading ); + vector_double_minus( &vdd[3], &vdd[2], &vdd[1], 0, l->inner_vector_size, l ); + diff = global_norm_double( &vdd[3], 0, l->inner_vector_size, l, no_threading ) / + global_norm_double( &vdd[2], 0, l->inner_vector_size, l, no_threading ); test0_PRECISION("depth: %d, correctness of doublet Dirac operator PRECISION with threading: %le\n", l->depth, diff ); END_LOCKED_MASTER(threading) } - PUBLIC_FREE( vd1, complex_double, 4*ivs + 2*4*ivs ); - PUBLIC_FREE( vpp1, complex_PRECISION, 2*2*ivs ); + for(int i=0; i<4; i++){ + vector_double_free( &vd[i], l, threading ); + vector_double_free( &vdd[i], l, threading ); + } + + for(int i=0; i<2; i++) + vector_PRECISION_free( &vpp[i], l, threading ); START_LOCKED_MASTER(threading) if ( g.method >=4 && g.odd_even ) diff --git a/src/dirac_generic.h b/src/dirac_generic.h index 1224f78..0d76a73 100644 --- a/src/dirac_generic.h +++ b/src/dirac_generic.h @@ -24,20 +24,25 @@ struct Thread; - void two_flavours_to_serial_PRECISION( vector_PRECISION flav1, vector_PRECISION flav2, vector_PRECISION serial, level_struct *l, struct Thread *threading ); - void serial_to_two_flavours_PRECISION( vector_PRECISION flav1, vector_PRECISION flav2, vector_PRECISION serial, level_struct *l, struct Thread *threading ); + void two_flavours_to_serial_PRECISION( vector_PRECISION *flav1, vector_PRECISION *flav2, vector_PRECISION *serial, level_struct *l, struct Thread *threading ); + void serial_to_two_flavours_PRECISION( vector_PRECISION *flav1, vector_PRECISION *flav2, vector_PRECISION *serial, level_struct *l, struct Thread *threading ); - void clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operator_PRECISION_struct *op, int start, int end, level_struct *l, struct Thread *threading ); + void clover_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, operator_PRECISION_struct *op, int start, int end, level_struct *l, struct Thread *threading ); + void clover_PRECISION_new( vector_PRECISION *eta, vector_PRECISION *phi, operator_PRECISION_struct *op, int start, int end, level_struct *l, struct Thread *threading ); + + void d_plus_clover_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); + void d_plus_clover_PRECISION_new( vector_PRECISION *eta, vector_PRECISION *phi, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); + void d_plus_clover_dagger_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); + void g5D_plus_clover_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, operator_PRECISION_struct *op, level_struct *l, 
struct Thread *threading ); + void block_d_plus_clover_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, int start, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ); + void diagonal_aggregate_PRECISION( vector_PRECISION *eta1, vector_PRECISION *eta2, vector_PRECISION *phi, config_PRECISION diag, level_struct *l ); + void d_plus_clover_aggregate_PRECISION( vector_PRECISION *eta1, vector_PRECISION *eta2, vector_PRECISION *phi, schwarz_PRECISION_struct *s, level_struct *l ); + void d_neighbor_aggregate_PRECISION( vector_PRECISION *eta1, vector_PRECISION *eta2, vector_PRECISION *phi, const int mu, schwarz_PRECISION_struct *s, level_struct *l ); - void d_plus_clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); - void d_plus_clover_dagger_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); - void g5D_plus_clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); - void block_d_plus_clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, int start, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ); - void diagonal_aggregate_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, vector_PRECISION phi, config_PRECISION diag, level_struct *l ); - void d_plus_clover_aggregate_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, vector_PRECISION phi, schwarz_PRECISION_struct *s, level_struct *l ); - void d_neighbor_aggregate_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, vector_PRECISION phi, const int mu, schwarz_PRECISION_struct *s, level_struct *l ); - void apply_twisted_bc_to_vector_PRECISION( vector_PRECISION eta, vector_PRECISION phi, double *theta, level_struct *l); + void apply_twisted_bc_to_vector_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, double *theta, level_struct *l); + void apply_twisted_bc_to_vector_PRECISION_new( vector_PRECISION *eta, vector_PRECISION *phi, double *theta, level_struct *l); + void operator_updates_PRECISION( level_struct *l, struct Thread *threading ); void m0_update_PRECISION( PRECISION m0,operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); void tm_term_PRECISION_setup( PRECISION mu, PRECISION even, PRECISION odd, operator_PRECISION_struct *op, @@ -46,22 +51,22 @@ level_struct *l, struct Thread *threading ); void two_flavours_test_PRECISION( operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); - void gamma5_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ); - void tau1_gamma5_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ); - void set_even_to_zero_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ); - void gamma5_set_even_to_zero_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ); - void tau1_gamma5_set_even_to_zero_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ); - void set_odd_to_zero_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ); - void gamma5_set_odd_to_zero_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ); - void 
tau1_gamma5_set_odd_to_zero_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ); - void scale_even_odd_PRECISION( vector_PRECISION eta, vector_PRECISION phi, complex_double even, complex_double odd, + void gamma5_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, level_struct *l, struct Thread *threading ); + void tau1_gamma5_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, level_struct *l, struct Thread *threading ); + void set_even_to_zero_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, level_struct *l, struct Thread *threading ); + void gamma5_set_even_to_zero_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, level_struct *l, struct Thread *threading ); + void tau1_gamma5_set_even_to_zero_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, level_struct *l, struct Thread *threading ); + void set_odd_to_zero_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, level_struct *l, struct Thread *threading ); + void gamma5_set_odd_to_zero_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, level_struct *l, struct Thread *threading ); + void tau1_gamma5_set_odd_to_zero_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, level_struct *l, struct Thread *threading ); + void scale_even_odd_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, complex_double even, complex_double odd, level_struct *l, struct Thread *threading ); - static inline void add_diagonal_PRECISION( const vector_PRECISION eta, const vector_PRECISION phi, + static inline void add_diagonal_PRECISION( const vector_PRECISION *eta, const vector_PRECISION *phi, const config_PRECISION diag, const int length ) { config_PRECISION diag_pt = diag; - vector_PRECISION phi_pt = phi, eta_pt = eta, eta_end = eta + length; + buffer_PRECISION phi_pt = phi->vector_buffer, eta_pt = eta->vector_buffer, eta_end = eta->vector_buffer + length; #ifdef HAVE_TM1p1 if(g.n_flavours == 2) while ( eta_pt < eta_end ) { @@ -79,10 +84,10 @@ } #ifdef HAVE_TM1p1 - static inline void apply_doublet_coupling_PRECISION( const vector_PRECISION eta, const vector_PRECISION phi, + static inline void apply_doublet_coupling_PRECISION( const vector_PRECISION *eta, const vector_PRECISION *phi, const config_PRECISION diag, const int length ) { config_PRECISION diag_pt = diag; - vector_PRECISION phi_pt = phi, eta_pt = eta, eta_end = eta + length; + buffer_PRECISION phi_pt = phi->vector_buffer, eta_pt = eta->vector_buffer, eta_end = eta->vector_buffer + length; while ( eta_pt < eta_end ) { phi_pt += 6; FOR6( *eta_pt += (*phi_pt)*(*diag_pt); eta_pt++; phi_pt++; diag_pt++; ) @@ -95,7 +100,7 @@ #endif // eta = D*phi - static inline void mvm_PRECISION( const vector_PRECISION eta, const complex_PRECISION *D, const vector_PRECISION phi ) { + static inline void mvm_PRECISION( const buffer_PRECISION eta, const complex_PRECISION *D, const buffer_PRECISION phi ) { eta[0] = D[0]*phi[0]; eta[0] += D[1]*phi[1]; eta[0] += D[2]*phi[2]; @@ -106,9 +111,22 @@ eta[2] += D[7]*phi[1]; eta[2] += D[8]*phi[2]; } + + static inline void mvm_PRECISION_new( const buffer_PRECISION eta, const complex_PRECISION *D, const buffer_PRECISION phi ) { + int n_vect = g.num_rhs_vect, j, jj; + VECTOR_LOOP(j, n_vect, jj, eta[0*n_vect+j+jj] = D[0]*phi[0*n_vect+j+jj]; + eta[0*n_vect+j+jj] += D[1]*phi[1*n_vect+j+jj]; + eta[0*n_vect+j+jj] += D[2]*phi[2*n_vect+j+jj]; + eta[1*n_vect+j+jj] = D[3]*phi[0*n_vect+j+jj]; + eta[1*n_vect+j+jj] += D[4]*phi[1*n_vect+j+jj]; + eta[1*n_vect+j+jj] += D[5]*phi[2*n_vect+j+jj]; + 
eta[2*n_vect+j+jj] = D[6]*phi[0*n_vect+j+jj]; + eta[2*n_vect+j+jj] += D[7]*phi[1*n_vect+j+jj]; + eta[2*n_vect+j+jj] += D[8]*phi[2*n_vect+j+jj];) + } // eta = D**H*phi - static inline void mvmh_PRECISION( const vector_PRECISION eta, const complex_PRECISION *D, const vector_PRECISION phi ) { + static inline void mvmh_PRECISION( const buffer_PRECISION eta, const complex_PRECISION *D, const buffer_PRECISION phi ) { eta[0] = conj_PRECISION(D[0])*phi[0]; eta[1] = conj_PRECISION(D[1])*phi[0]; eta[2] = conj_PRECISION(D[2])*phi[0]; @@ -119,9 +137,22 @@ eta[1] += conj_PRECISION(D[7])*phi[2]; eta[2] += conj_PRECISION(D[8])*phi[2]; } + + static inline void mvmh_PRECISION_new( const buffer_PRECISION eta, const complex_PRECISION *D, const buffer_PRECISION phi ) { + int n_vect = g.num_rhs_vect, j, jj; + VECTOR_LOOP(j, n_vect, jj, eta[0*n_vect+j+jj] = conj_PRECISION(D[0])*phi[0*n_vect+j+jj]; + eta[1*n_vect+j+jj] = conj_PRECISION(D[1])*phi[0*n_vect+j+jj]; + eta[2*n_vect+j+jj] = conj_PRECISION(D[2])*phi[0*n_vect+j+jj]; + eta[0*n_vect+j+jj] += conj_PRECISION(D[3])*phi[1*n_vect+j+jj]; + eta[1*n_vect+j+jj] += conj_PRECISION(D[4])*phi[1*n_vect+j+jj]; + eta[2*n_vect+j+jj] += conj_PRECISION(D[5])*phi[1*n_vect+j+jj]; + eta[0*n_vect+j+jj] += conj_PRECISION(D[6])*phi[2*n_vect+j+jj]; + eta[1*n_vect+j+jj] += conj_PRECISION(D[7])*phi[2*n_vect+j+jj]; + eta[2*n_vect+j+jj] += conj_PRECISION(D[8])*phi[2*n_vect+j+jj];) + } // eta = -D*phi - static inline void nmvm_PRECISION( const vector_PRECISION eta, const complex_PRECISION *D, const vector_PRECISION phi ) { + static inline void nmvm_PRECISION( const buffer_PRECISION eta, const complex_PRECISION *D, const buffer_PRECISION phi ) { eta[0] = - D[0]*phi[0]; eta[0] -= D[1]*phi[1]; eta[0] -= D[2]*phi[2]; @@ -134,7 +165,7 @@ } // eta = -D**H*phi - static inline void nmvmh_PRECISION( const vector_PRECISION eta, const complex_PRECISION *D, const vector_PRECISION phi ) { + static inline void nmvmh_PRECISION( const buffer_PRECISION eta, const complex_PRECISION *D, const buffer_PRECISION phi ) { eta[0] = - conj_PRECISION(D[0])*phi[0]; eta[1] = - conj_PRECISION(D[1])*phi[0]; eta[2] = - conj_PRECISION(D[2])*phi[0]; @@ -147,7 +178,7 @@ } // 1 - gamma_T - static inline void prp_T_PRECISION( const vector_PRECISION prp_pt, const vector_PRECISION l_pt ) { + static inline void prp_T_PRECISION( const buffer_PRECISION prp_pt, const buffer_PRECISION l_pt ) { prp_pt[0] = l_pt[0] -GAMMA_T_SPIN0_VAL*l_pt[3*GAMMA_T_SPIN0_CO]; prp_pt[1] = l_pt[1] -GAMMA_T_SPIN0_VAL*l_pt[3*GAMMA_T_SPIN0_CO+1]; prp_pt[2] = l_pt[2] -GAMMA_T_SPIN0_VAL*l_pt[3*GAMMA_T_SPIN0_CO+2]; @@ -155,9 +186,19 @@ prp_pt[4] = l_pt[4] -GAMMA_T_SPIN1_VAL*l_pt[3*GAMMA_T_SPIN1_CO+1]; prp_pt[5] = l_pt[5] -GAMMA_T_SPIN1_VAL*l_pt[3*GAMMA_T_SPIN1_CO+2]; } + + static inline void prp_T_PRECISION_new( const buffer_PRECISION prp_pt, const buffer_PRECISION l_pt ) { + int n_vect = g.num_rhs_vect, j, jj; + VECTOR_LOOP(j, n_vect, jj, prp_pt[0*n_vect+j+jj] = l_pt[0*n_vect+j+jj] -GAMMA_T_SPIN0_VAL*l_pt[3*GAMMA_T_SPIN0_CO*n_vect+j+jj]; + prp_pt[1*n_vect+j+jj] = l_pt[1*n_vect+j+jj] -GAMMA_T_SPIN0_VAL*l_pt[(3*GAMMA_T_SPIN0_CO+1)*n_vect+j+jj]; + prp_pt[2*n_vect+j+jj] = l_pt[2*n_vect+j+jj] -GAMMA_T_SPIN0_VAL*l_pt[(3*GAMMA_T_SPIN0_CO+2)*n_vect+j+jj]; + prp_pt[3*n_vect+j+jj] = l_pt[3*n_vect+j+jj] -GAMMA_T_SPIN1_VAL*l_pt[3*GAMMA_T_SPIN1_CO*n_vect+j+jj]; + prp_pt[4*n_vect+j+jj] = l_pt[4*n_vect+j+jj] -GAMMA_T_SPIN1_VAL*l_pt[(3*GAMMA_T_SPIN1_CO+1)*n_vect+j+jj]; + prp_pt[5*n_vect+j+jj] = l_pt[5*n_vect+j+jj] 
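/* Hedged aside: mvm_PRECISION_new / mvmh_PRECISION_new above apply one 3x3
 * colour matrix D (respectively its conjugate transpose) to all n_vect
 * interleaved right-hand sides at once,
 *   eta[a*n_vect+r] = sum_b D[3a+b]       * phi[b*n_vect+r]   (mvm),
 *   eta[a*n_vect+r] = sum_b conj(D[3b+a]) * phi[b*n_vect+r]   (mvmh).
 * Scalar reference, illustration only (mvm_reference is hypothetical): */
static void mvm_reference( complex_PRECISION *eta, const complex_PRECISION *D,
                           const complex_PRECISION *phi, int n_vect ) {
  for ( int a=0; a<3; a++ )
    for ( int r=0; r<n_vect; r++ ) {
      eta[a*n_vect + r] = 0;
      for ( int b=0; b<3; b++ )
        eta[a*n_vect + r] += D[3*a + b] * phi[b*n_vect + r];
    }
}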
-GAMMA_T_SPIN1_VAL*l_pt[(3*GAMMA_T_SPIN1_CO+2)*n_vect+j+jj];) + } // 1 + gamma_T - static inline void prn_T_PRECISION( const vector_PRECISION prn_pt, const vector_PRECISION l_pt ) { + static inline void prn_T_PRECISION( const buffer_PRECISION prn_pt, const buffer_PRECISION l_pt ) { prn_pt[0] = l_pt[0] +GAMMA_T_SPIN0_VAL*l_pt[3*GAMMA_T_SPIN0_CO]; prn_pt[1] = l_pt[1] +GAMMA_T_SPIN0_VAL*l_pt[3*GAMMA_T_SPIN0_CO+1]; prn_pt[2] = l_pt[2] +GAMMA_T_SPIN0_VAL*l_pt[3*GAMMA_T_SPIN0_CO+2]; @@ -166,8 +207,18 @@ prn_pt[5] = l_pt[5] +GAMMA_T_SPIN1_VAL*l_pt[3*GAMMA_T_SPIN1_CO+2]; } + static inline void prn_T_PRECISION_new( const buffer_PRECISION prn_pt, const buffer_PRECISION l_pt ) { + int n_vect = g.num_rhs_vect, j, jj; + VECTOR_LOOP(j, n_vect, jj, prn_pt[0*n_vect+j+jj] = l_pt[0*n_vect+j+jj] +GAMMA_T_SPIN0_VAL*l_pt[3*GAMMA_T_SPIN0_CO*n_vect+j+jj]; + prn_pt[1*n_vect+j+jj] = l_pt[1*n_vect+j+jj] +GAMMA_T_SPIN0_VAL*l_pt[(3*GAMMA_T_SPIN0_CO+1)*n_vect+j+jj]; + prn_pt[2*n_vect+j+jj] = l_pt[2*n_vect+j+jj] +GAMMA_T_SPIN0_VAL*l_pt[(3*GAMMA_T_SPIN0_CO+2)*n_vect+j+jj]; + prn_pt[3*n_vect+j+jj] = l_pt[3*n_vect+j+jj] +GAMMA_T_SPIN1_VAL*l_pt[3*GAMMA_T_SPIN1_CO*n_vect+j+jj]; + prn_pt[4*n_vect+j+jj] = l_pt[4*n_vect+j+jj] +GAMMA_T_SPIN1_VAL*l_pt[(3*GAMMA_T_SPIN1_CO+1)*n_vect+j+jj]; + prn_pt[5*n_vect+j+jj] = l_pt[5*n_vect+j+jj] +GAMMA_T_SPIN1_VAL*l_pt[(3*GAMMA_T_SPIN1_CO+2)*n_vect+j+jj];) + } + // - (1 - gamma_T) - static inline void pbp_su3_T_PRECISION( const vector_PRECISION prp_su3_pt, const vector_PRECISION l_pt ) { + static inline void pbp_su3_T_PRECISION( const buffer_PRECISION prp_su3_pt, const buffer_PRECISION l_pt ) { l_pt[ 0] -= prp_su3_pt[0]; l_pt[ 1] -= prp_su3_pt[1]; l_pt[ 2] -= prp_su3_pt[2]; @@ -182,8 +233,24 @@ l_pt[11] += GAMMA_T_SPIN3_VAL*prp_su3_pt[3*GAMMA_T_SPIN3_CO+2]; } + static inline void pbp_su3_T_PRECISION_new( const buffer_PRECISION prp_su3_pt, const buffer_PRECISION l_pt ) { + int n_vect = g.num_rhs_vect, j, jj; + VECTOR_LOOP(j, n_vect, jj, l_pt[ 0*n_vect+j+jj] -= prp_su3_pt[0*n_vect+j+jj]; + l_pt[ 1*n_vect+j+jj] -= prp_su3_pt[1*n_vect+j+jj]; + l_pt[ 2*n_vect+j+jj] -= prp_su3_pt[2*n_vect+j+jj]; + l_pt[ 3*n_vect+j+jj] -= prp_su3_pt[3*n_vect+j+jj]; + l_pt[ 4*n_vect+j+jj] -= prp_su3_pt[4*n_vect+j+jj]; + l_pt[ 5*n_vect+j+jj] -= prp_su3_pt[5*n_vect+j+jj]; + l_pt[ 6*n_vect+j+jj] += GAMMA_T_SPIN2_VAL*prp_su3_pt[3*GAMMA_T_SPIN2_CO*n_vect+j+jj]; + l_pt[ 7*n_vect+j+jj] += GAMMA_T_SPIN2_VAL*prp_su3_pt[(3*GAMMA_T_SPIN2_CO+1)*n_vect+j+jj]; + l_pt[ 8*n_vect+j+jj] += GAMMA_T_SPIN2_VAL*prp_su3_pt[(3*GAMMA_T_SPIN2_CO+2)*n_vect+j+jj]; + l_pt[ 9*n_vect+j+jj] += GAMMA_T_SPIN3_VAL*prp_su3_pt[3*GAMMA_T_SPIN3_CO*n_vect+j+jj]; + l_pt[10*n_vect+j+jj] += GAMMA_T_SPIN3_VAL*prp_su3_pt[(3*GAMMA_T_SPIN3_CO+1)*n_vect+j+jj]; + l_pt[11*n_vect+j+jj] += GAMMA_T_SPIN3_VAL*prp_su3_pt[(3*GAMMA_T_SPIN3_CO+2)*n_vect+j+jj];) + } + // -(1 + gamma_T) - static inline void pbn_su3_T_PRECISION( const vector_PRECISION prn_su3_pt, const vector_PRECISION l_pt ) { + static inline void pbn_su3_T_PRECISION( const buffer_PRECISION prn_su3_pt, const buffer_PRECISION l_pt ) { l_pt[ 0] -= prn_su3_pt[0]; l_pt[ 1] -= prn_su3_pt[1]; l_pt[ 2] -= prn_su3_pt[2]; @@ -198,7 +265,23 @@ l_pt[11] -= GAMMA_T_SPIN3_VAL*prn_su3_pt[3*GAMMA_T_SPIN3_CO+2]; } - static inline void prp_Z_PRECISION( const vector_PRECISION prp_pt, const vector_PRECISION l_pt ) { + static inline void pbn_su3_T_PRECISION_new( const buffer_PRECISION prn_su3_pt, const buffer_PRECISION l_pt ) { + int n_vect = g.num_rhs_vect, j, jj; + VECTOR_LOOP(j, n_vect, jj, l_pt[ 0*n_vect+j+jj] -= 
prn_su3_pt[0*n_vect+j+jj]; + l_pt[ 1*n_vect+j+jj] -= prn_su3_pt[1*n_vect+j+jj]; + l_pt[ 2*n_vect+j+jj] -= prn_su3_pt[2*n_vect+j+jj]; + l_pt[ 3*n_vect+j+jj] -= prn_su3_pt[3*n_vect+j+jj]; + l_pt[ 4*n_vect+j+jj] -= prn_su3_pt[4*n_vect+j+jj]; + l_pt[ 5*n_vect+j+jj] -= prn_su3_pt[5*n_vect+j+jj]; + l_pt[ 6*n_vect+j+jj] -= GAMMA_T_SPIN2_VAL*prn_su3_pt[3*GAMMA_T_SPIN2_CO*n_vect+j+jj]; + l_pt[ 7*n_vect+j+jj] -= GAMMA_T_SPIN2_VAL*prn_su3_pt[(3*GAMMA_T_SPIN2_CO+1)*n_vect+j+jj]; + l_pt[ 8*n_vect+j+jj] -= GAMMA_T_SPIN2_VAL*prn_su3_pt[(3*GAMMA_T_SPIN2_CO+2)*n_vect+j+jj]; + l_pt[ 9*n_vect+j+jj] -= GAMMA_T_SPIN3_VAL*prn_su3_pt[3*GAMMA_T_SPIN3_CO*n_vect+j+jj]; + l_pt[10*n_vect+j+jj] -= GAMMA_T_SPIN3_VAL*prn_su3_pt[(3*GAMMA_T_SPIN3_CO+1)*n_vect+j+jj]; + l_pt[11*n_vect+j+jj] -= GAMMA_T_SPIN3_VAL*prn_su3_pt[(3*GAMMA_T_SPIN3_CO+2)*n_vect+j+jj];) + } + + static inline void prp_Z_PRECISION( const buffer_PRECISION prp_pt, const buffer_PRECISION l_pt ) { prp_pt[0] = l_pt[0] -GAMMA_Z_SPIN0_VAL*l_pt[3*GAMMA_Z_SPIN0_CO]; prp_pt[1] = l_pt[1] -GAMMA_Z_SPIN0_VAL*l_pt[3*GAMMA_Z_SPIN0_CO+1]; prp_pt[2] = l_pt[2] -GAMMA_Z_SPIN0_VAL*l_pt[3*GAMMA_Z_SPIN0_CO+2]; @@ -207,7 +290,17 @@ prp_pt[5] = l_pt[5] -GAMMA_Z_SPIN1_VAL*l_pt[3*GAMMA_Z_SPIN1_CO+2]; } - static inline void prn_Z_PRECISION( const vector_PRECISION prn_pt, const vector_PRECISION l_pt ) { + static inline void prp_Z_PRECISION_new( const buffer_PRECISION prp_pt, const buffer_PRECISION l_pt ) { + int n_vect = g.num_rhs_vect, j, jj; + VECTOR_LOOP(j, n_vect, jj, prp_pt[0*n_vect+j+jj] = l_pt[0*n_vect+j+jj] -GAMMA_Z_SPIN0_VAL*l_pt[3*GAMMA_Z_SPIN0_CO*n_vect+j+jj]; + prp_pt[1*n_vect+j+jj] = l_pt[1*n_vect+j+jj] -GAMMA_Z_SPIN0_VAL*l_pt[(3*GAMMA_Z_SPIN0_CO+1)*n_vect+j+jj]; + prp_pt[2*n_vect+j+jj] = l_pt[2*n_vect+j+jj] -GAMMA_Z_SPIN0_VAL*l_pt[(3*GAMMA_Z_SPIN0_CO+2)*n_vect+j+jj]; + prp_pt[3*n_vect+j+jj] = l_pt[3*n_vect+j+jj] -GAMMA_Z_SPIN1_VAL*l_pt[3*GAMMA_Z_SPIN1_CO*n_vect+j+jj]; + prp_pt[4*n_vect+j+jj] = l_pt[4*n_vect+j+jj] -GAMMA_Z_SPIN1_VAL*l_pt[(3*GAMMA_Z_SPIN1_CO+1)*n_vect+j+jj]; + prp_pt[5*n_vect+j+jj] = l_pt[5*n_vect+j+jj] -GAMMA_Z_SPIN1_VAL*l_pt[(3*GAMMA_Z_SPIN1_CO+2)*n_vect+j+jj];) + } + + static inline void prn_Z_PRECISION( const buffer_PRECISION prn_pt, const buffer_PRECISION l_pt ) { prn_pt[0] = l_pt[0] +GAMMA_Z_SPIN0_VAL*l_pt[3*GAMMA_Z_SPIN0_CO]; prn_pt[1] = l_pt[1] +GAMMA_Z_SPIN0_VAL*l_pt[3*GAMMA_Z_SPIN0_CO+1]; prn_pt[2] = l_pt[2] +GAMMA_Z_SPIN0_VAL*l_pt[3*GAMMA_Z_SPIN0_CO+2]; @@ -216,7 +309,17 @@ prn_pt[5] = l_pt[5] +GAMMA_Z_SPIN1_VAL*l_pt[3*GAMMA_Z_SPIN1_CO+2]; } - static inline void pbp_su3_Z_PRECISION( const vector_PRECISION prp_su3_pt, const vector_PRECISION l_pt ) { + static inline void prn_Z_PRECISION_new( const buffer_PRECISION prn_pt, const buffer_PRECISION l_pt ) { + int n_vect = g.num_rhs_vect, j, jj; + VECTOR_LOOP(j, n_vect, jj, prn_pt[0*n_vect+j+jj] = l_pt[0*n_vect+j+jj] +GAMMA_Z_SPIN0_VAL*l_pt[3*GAMMA_Z_SPIN0_CO*n_vect+j+jj]; + prn_pt[1*n_vect+j+jj] = l_pt[1*n_vect+j+jj] +GAMMA_Z_SPIN0_VAL*l_pt[(3*GAMMA_Z_SPIN0_CO+1)*n_vect+j+jj]; + prn_pt[2*n_vect+j+jj] = l_pt[2*n_vect+j+jj] +GAMMA_Z_SPIN0_VAL*l_pt[(3*GAMMA_Z_SPIN0_CO+2)*n_vect+j+jj]; + prn_pt[3*n_vect+j+jj] = l_pt[3*n_vect+j+jj] +GAMMA_Z_SPIN1_VAL*l_pt[3*GAMMA_Z_SPIN1_CO*n_vect+j+jj]; + prn_pt[4*n_vect+j+jj] = l_pt[4*n_vect+j+jj] +GAMMA_Z_SPIN1_VAL*l_pt[(3*GAMMA_Z_SPIN1_CO+1)*n_vect+j+jj]; + prn_pt[5*n_vect+j+jj] = l_pt[5*n_vect+j+jj] +GAMMA_Z_SPIN1_VAL*l_pt[(3*GAMMA_Z_SPIN1_CO+2)*n_vect+j+jj];) + } + + static inline void pbp_su3_Z_PRECISION( const buffer_PRECISION prp_su3_pt, const buffer_PRECISION 
l_pt ) { l_pt[ 0] -= prp_su3_pt[0]; l_pt[ 1] -= prp_su3_pt[1]; l_pt[ 2] -= prp_su3_pt[2]; @@ -231,7 +334,23 @@ l_pt[11] += GAMMA_Z_SPIN3_VAL*prp_su3_pt[3*GAMMA_Z_SPIN3_CO+2]; } - static inline void pbn_su3_Z_PRECISION( const vector_PRECISION prn_su3_pt, const vector_PRECISION l_pt ) { + static inline void pbp_su3_Z_PRECISION_new( const buffer_PRECISION prp_su3_pt, const buffer_PRECISION l_pt ) { + int n_vect = g.num_rhs_vect, j, jj; + VECTOR_LOOP(j, n_vect, jj, l_pt[ 0*n_vect+j+jj] -= prp_su3_pt[0*n_vect+j+jj]; + l_pt[ 1*n_vect+j+jj] -= prp_su3_pt[1*n_vect+j+jj]; + l_pt[ 2*n_vect+j+jj] -= prp_su3_pt[2*n_vect+j+jj]; + l_pt[ 3*n_vect+j+jj] -= prp_su3_pt[3*n_vect+j+jj]; + l_pt[ 4*n_vect+j+jj] -= prp_su3_pt[4*n_vect+j+jj]; + l_pt[ 5*n_vect+j+jj] -= prp_su3_pt[5*n_vect+j+jj]; + l_pt[ 6*n_vect+j+jj] += GAMMA_Z_SPIN2_VAL*prp_su3_pt[3*GAMMA_Z_SPIN2_CO*n_vect+j+jj]; + l_pt[ 7*n_vect+j+jj] += GAMMA_Z_SPIN2_VAL*prp_su3_pt[(3*GAMMA_Z_SPIN2_CO+1)*n_vect+j+jj]; + l_pt[ 8*n_vect+j+jj] += GAMMA_Z_SPIN2_VAL*prp_su3_pt[(3*GAMMA_Z_SPIN2_CO+2)*n_vect+j+jj]; + l_pt[ 9*n_vect+j+jj] += GAMMA_Z_SPIN3_VAL*prp_su3_pt[3*GAMMA_Z_SPIN3_CO*n_vect+j+jj]; + l_pt[10*n_vect+j+jj] += GAMMA_Z_SPIN3_VAL*prp_su3_pt[(3*GAMMA_Z_SPIN3_CO+1)*n_vect+j+jj]; + l_pt[11*n_vect+j+jj] += GAMMA_Z_SPIN3_VAL*prp_su3_pt[(3*GAMMA_Z_SPIN3_CO+2)*n_vect+j+jj];) + } + + static inline void pbn_su3_Z_PRECISION( const buffer_PRECISION prn_su3_pt, const buffer_PRECISION l_pt ) { l_pt[ 0] -= prn_su3_pt[0]; l_pt[ 1] -= prn_su3_pt[1]; l_pt[ 2] -= prn_su3_pt[2]; @@ -246,7 +365,23 @@ l_pt[11] -= GAMMA_Z_SPIN3_VAL*prn_su3_pt[3*GAMMA_Z_SPIN3_CO+2]; } - static inline void prp_Y_PRECISION( const vector_PRECISION prp_pt, const vector_PRECISION l_pt ) { + static inline void pbn_su3_Z_PRECISION_new( const buffer_PRECISION prn_su3_pt, const buffer_PRECISION l_pt ) { + int n_vect = g.num_rhs_vect, j, jj; + VECTOR_LOOP(j, n_vect, jj, l_pt[ 0*n_vect+j+jj] -= prn_su3_pt[0*n_vect+j+jj]; + l_pt[ 1*n_vect+j+jj] -= prn_su3_pt[1*n_vect+j+jj]; + l_pt[ 2*n_vect+j+jj] -= prn_su3_pt[2*n_vect+j+jj]; + l_pt[ 3*n_vect+j+jj] -= prn_su3_pt[3*n_vect+j+jj]; + l_pt[ 4*n_vect+j+jj] -= prn_su3_pt[4*n_vect+j+jj]; + l_pt[ 5*n_vect+j+jj] -= prn_su3_pt[5*n_vect+j+jj]; + l_pt[ 6*n_vect+j+jj] -= GAMMA_Z_SPIN2_VAL*prn_su3_pt[3*GAMMA_Z_SPIN2_CO*n_vect+j+jj]; + l_pt[ 7*n_vect+j+jj] -= GAMMA_Z_SPIN2_VAL*prn_su3_pt[(3*GAMMA_Z_SPIN2_CO+1)*n_vect+j+jj]; + l_pt[ 8*n_vect+j+jj] -= GAMMA_Z_SPIN2_VAL*prn_su3_pt[(3*GAMMA_Z_SPIN2_CO+2)*n_vect+j+jj]; + l_pt[ 9*n_vect+j+jj] -= GAMMA_Z_SPIN3_VAL*prn_su3_pt[3*GAMMA_Z_SPIN3_CO*n_vect+j+jj]; + l_pt[10*n_vect+j+jj] -= GAMMA_Z_SPIN3_VAL*prn_su3_pt[(3*GAMMA_Z_SPIN3_CO+1)*n_vect+j+jj]; + l_pt[11*n_vect+j+jj] -= GAMMA_Z_SPIN3_VAL*prn_su3_pt[(3*GAMMA_Z_SPIN3_CO+2)*n_vect+j+jj];) + } + + static inline void prp_Y_PRECISION( const buffer_PRECISION prp_pt, const buffer_PRECISION l_pt ) { prp_pt[0] = l_pt[0] -GAMMA_Y_SPIN0_VAL*l_pt[3*GAMMA_Y_SPIN0_CO]; prp_pt[1] = l_pt[1] -GAMMA_Y_SPIN0_VAL*l_pt[3*GAMMA_Y_SPIN0_CO+1]; prp_pt[2] = l_pt[2] -GAMMA_Y_SPIN0_VAL*l_pt[3*GAMMA_Y_SPIN0_CO+2]; @@ -255,7 +390,17 @@ prp_pt[5] = l_pt[5] -GAMMA_Y_SPIN1_VAL*l_pt[3*GAMMA_Y_SPIN1_CO+2]; } - static inline void prn_Y_PRECISION( const vector_PRECISION prn_pt, const vector_PRECISION l_pt ) { + static inline void prp_Y_PRECISION_new( const buffer_PRECISION prp_pt, const buffer_PRECISION l_pt ) { + int n_vect = g.num_rhs_vect, j, jj; + VECTOR_LOOP(j, n_vect, jj, prp_pt[0*n_vect+j+jj] = l_pt[0*n_vect+j+jj] -GAMMA_Y_SPIN0_VAL*l_pt[3*GAMMA_Y_SPIN0_CO*n_vect+j+jj]; + prp_pt[1*n_vect+j+jj] = 
l_pt[1*n_vect+j+jj] -GAMMA_Y_SPIN0_VAL*l_pt[(3*GAMMA_Y_SPIN0_CO+1)*n_vect+j+jj]; + prp_pt[2*n_vect+j+jj] = l_pt[2*n_vect+j+jj] -GAMMA_Y_SPIN0_VAL*l_pt[(3*GAMMA_Y_SPIN0_CO+2)*n_vect+j+jj]; + prp_pt[3*n_vect+j+jj] = l_pt[3*n_vect+j+jj] -GAMMA_Y_SPIN1_VAL*l_pt[3*GAMMA_Y_SPIN1_CO*n_vect+j+jj]; + prp_pt[4*n_vect+j+jj] = l_pt[4*n_vect+j+jj] -GAMMA_Y_SPIN1_VAL*l_pt[(3*GAMMA_Y_SPIN1_CO+1)*n_vect+j+jj]; + prp_pt[5*n_vect+j+jj] = l_pt[5*n_vect+j+jj] -GAMMA_Y_SPIN1_VAL*l_pt[(3*GAMMA_Y_SPIN1_CO+2)*n_vect+j+jj];) + } + + static inline void prn_Y_PRECISION( const buffer_PRECISION prn_pt, const buffer_PRECISION l_pt ) { prn_pt[0] = l_pt[0] +GAMMA_Y_SPIN0_VAL*l_pt[3*GAMMA_Y_SPIN0_CO]; prn_pt[1] = l_pt[1] +GAMMA_Y_SPIN0_VAL*l_pt[3*GAMMA_Y_SPIN0_CO+1]; prn_pt[2] = l_pt[2] +GAMMA_Y_SPIN0_VAL*l_pt[3*GAMMA_Y_SPIN0_CO+2]; @@ -264,7 +409,17 @@ prn_pt[5] = l_pt[5] +GAMMA_Y_SPIN1_VAL*l_pt[3*GAMMA_Y_SPIN1_CO+2]; } - static inline void pbp_su3_Y_PRECISION( const vector_PRECISION prp_su3_pt, const vector_PRECISION l_pt ) { + static inline void prn_Y_PRECISION_new( const buffer_PRECISION prn_pt, const buffer_PRECISION l_pt ) { + int n_vect = g.num_rhs_vect, j, jj; + VECTOR_LOOP(j, n_vect, jj, prn_pt[0*n_vect+j+jj] = l_pt[0*n_vect+j+jj] +GAMMA_Y_SPIN0_VAL*l_pt[3*GAMMA_Y_SPIN0_CO*n_vect+j+jj]; + prn_pt[1*n_vect+j+jj] = l_pt[1*n_vect+j+jj] +GAMMA_Y_SPIN0_VAL*l_pt[(3*GAMMA_Y_SPIN0_CO+1)*n_vect+j+jj]; + prn_pt[2*n_vect+j+jj] = l_pt[2*n_vect+j+jj] +GAMMA_Y_SPIN0_VAL*l_pt[(3*GAMMA_Y_SPIN0_CO+2)*n_vect+j+jj]; + prn_pt[3*n_vect+j+jj] = l_pt[3*n_vect+j+jj] +GAMMA_Y_SPIN1_VAL*l_pt[3*GAMMA_Y_SPIN1_CO*n_vect+j+jj]; + prn_pt[4*n_vect+j+jj] = l_pt[4*n_vect+j+jj] +GAMMA_Y_SPIN1_VAL*l_pt[(3*GAMMA_Y_SPIN1_CO+1)*n_vect+j+jj]; + prn_pt[5*n_vect+j+jj] = l_pt[5*n_vect+j+jj] +GAMMA_Y_SPIN1_VAL*l_pt[(3*GAMMA_Y_SPIN1_CO+2)*n_vect+j+jj];) + } + + static inline void pbp_su3_Y_PRECISION( const buffer_PRECISION prp_su3_pt, const buffer_PRECISION l_pt ) { l_pt[ 0] -= prp_su3_pt[0]; l_pt[ 1] -= prp_su3_pt[1]; l_pt[ 2] -= prp_su3_pt[2]; @@ -279,7 +434,23 @@ l_pt[11] += GAMMA_Y_SPIN3_VAL*prp_su3_pt[3*GAMMA_Y_SPIN3_CO+2]; } - static inline void pbn_su3_Y_PRECISION( const vector_PRECISION prn_su3_pt, const vector_PRECISION l_pt ) { + static inline void pbp_su3_Y_PRECISION_new( const buffer_PRECISION prp_su3_pt, const buffer_PRECISION l_pt ) { + int n_vect = g.num_rhs_vect, j, jj; + VECTOR_LOOP(j, n_vect, jj, l_pt[ 0*n_vect+j+jj] -= prp_su3_pt[0*n_vect+j+jj]; + l_pt[ 1*n_vect+j+jj] -= prp_su3_pt[1*n_vect+j+jj]; + l_pt[ 2*n_vect+j+jj] -= prp_su3_pt[2*n_vect+j+jj]; + l_pt[ 3*n_vect+j+jj] -= prp_su3_pt[3*n_vect+j+jj]; + l_pt[ 4*n_vect+j+jj] -= prp_su3_pt[4*n_vect+j+jj]; + l_pt[ 5*n_vect+j+jj] -= prp_su3_pt[5*n_vect+j+jj]; + l_pt[ 6*n_vect+j+jj] += GAMMA_Y_SPIN2_VAL*prp_su3_pt[3*GAMMA_Y_SPIN2_CO*n_vect+j+jj]; + l_pt[ 7*n_vect+j+jj] += GAMMA_Y_SPIN2_VAL*prp_su3_pt[(3*GAMMA_Y_SPIN2_CO+1)*n_vect+j+jj]; + l_pt[ 8*n_vect+j+jj] += GAMMA_Y_SPIN2_VAL*prp_su3_pt[(3*GAMMA_Y_SPIN2_CO+2)*n_vect+j+jj]; + l_pt[ 9*n_vect+j+jj] += GAMMA_Y_SPIN3_VAL*prp_su3_pt[3*GAMMA_Y_SPIN3_CO*n_vect+j+jj]; + l_pt[10*n_vect+j+jj] += GAMMA_Y_SPIN3_VAL*prp_su3_pt[(3*GAMMA_Y_SPIN3_CO+1)*n_vect+j+jj]; + l_pt[11*n_vect+j+jj] += GAMMA_Y_SPIN3_VAL*prp_su3_pt[(3*GAMMA_Y_SPIN3_CO+2)*n_vect+j+jj];) + } + + static inline void pbn_su3_Y_PRECISION( const buffer_PRECISION prn_su3_pt, const buffer_PRECISION l_pt ) { l_pt[ 0] -= prn_su3_pt[0]; l_pt[ 1] -= prn_su3_pt[1]; l_pt[ 2] -= prn_su3_pt[2]; @@ -294,7 +465,23 @@ l_pt[11] -= GAMMA_Y_SPIN3_VAL*prn_su3_pt[3*GAMMA_Y_SPIN3_CO+2]; } - static inline void 
prp_X_PRECISION( const vector_PRECISION prp_pt, const vector_PRECISION l_pt ) { + static inline void pbn_su3_Y_PRECISION_new( const buffer_PRECISION prn_su3_pt, const buffer_PRECISION l_pt ) { + int n_vect = g.num_rhs_vect, j, jj; + VECTOR_LOOP(j, n_vect, jj, l_pt[ 0*n_vect+j+jj] -= prn_su3_pt[0*n_vect+j+jj]; + l_pt[ 1*n_vect+j+jj] -= prn_su3_pt[1*n_vect+j+jj]; + l_pt[ 2*n_vect+j+jj] -= prn_su3_pt[2*n_vect+j+jj]; + l_pt[ 3*n_vect+j+jj] -= prn_su3_pt[3*n_vect+j+jj]; + l_pt[ 4*n_vect+j+jj] -= prn_su3_pt[4*n_vect+j+jj]; + l_pt[ 5*n_vect+j+jj] -= prn_su3_pt[5*n_vect+j+jj]; + l_pt[ 6*n_vect+j+jj] -= GAMMA_Y_SPIN2_VAL*prn_su3_pt[3*GAMMA_Y_SPIN2_CO*n_vect+j+jj]; + l_pt[ 7*n_vect+j+jj] -= GAMMA_Y_SPIN2_VAL*prn_su3_pt[(3*GAMMA_Y_SPIN2_CO+1)*n_vect+j+jj]; + l_pt[ 8*n_vect+j+jj] -= GAMMA_Y_SPIN2_VAL*prn_su3_pt[(3*GAMMA_Y_SPIN2_CO+2)*n_vect+j+jj]; + l_pt[ 9*n_vect+j+jj] -= GAMMA_Y_SPIN3_VAL*prn_su3_pt[3*GAMMA_Y_SPIN3_CO*n_vect+j+jj]; + l_pt[10*n_vect+j+jj] -= GAMMA_Y_SPIN3_VAL*prn_su3_pt[(3*GAMMA_Y_SPIN3_CO+1)*n_vect+j+jj]; + l_pt[11*n_vect+j+jj] -= GAMMA_Y_SPIN3_VAL*prn_su3_pt[(3*GAMMA_Y_SPIN3_CO+2)*n_vect+j+jj];) + } + + static inline void prp_X_PRECISION( const buffer_PRECISION prp_pt, const buffer_PRECISION l_pt ) { prp_pt[0] = l_pt[0] -GAMMA_X_SPIN0_VAL*l_pt[3*GAMMA_X_SPIN0_CO]; prp_pt[1] = l_pt[1] -GAMMA_X_SPIN0_VAL*l_pt[3*GAMMA_X_SPIN0_CO+1]; prp_pt[2] = l_pt[2] -GAMMA_X_SPIN0_VAL*l_pt[3*GAMMA_X_SPIN0_CO+2]; @@ -303,7 +490,17 @@ prp_pt[5] = l_pt[5] -GAMMA_X_SPIN1_VAL*l_pt[3*GAMMA_X_SPIN1_CO+2]; } - static inline void prn_X_PRECISION( const vector_PRECISION prn_pt, const vector_PRECISION l_pt ) { + static inline void prp_X_PRECISION_new( const buffer_PRECISION prp_pt, const buffer_PRECISION l_pt ) { + int n_vect = g.num_rhs_vect, j, jj; + VECTOR_LOOP(j, n_vect, jj, prp_pt[0*n_vect+j+jj] = l_pt[0*n_vect+j+jj] -GAMMA_X_SPIN0_VAL*l_pt[3*GAMMA_X_SPIN0_CO*n_vect+j+jj]; + prp_pt[1*n_vect+j+jj] = l_pt[1*n_vect+j+jj] -GAMMA_X_SPIN0_VAL*l_pt[(3*GAMMA_X_SPIN0_CO+1)*n_vect+j+jj]; + prp_pt[2*n_vect+j+jj] = l_pt[2*n_vect+j+jj] -GAMMA_X_SPIN0_VAL*l_pt[(3*GAMMA_X_SPIN0_CO+2)*n_vect+j+jj]; + prp_pt[3*n_vect+j+jj] = l_pt[3*n_vect+j+jj] -GAMMA_X_SPIN1_VAL*l_pt[3*GAMMA_X_SPIN1_CO*n_vect+j+jj]; + prp_pt[4*n_vect+j+jj] = l_pt[4*n_vect+j+jj] -GAMMA_X_SPIN1_VAL*l_pt[(3*GAMMA_X_SPIN1_CO+1)*n_vect+j+jj]; + prp_pt[5*n_vect+j+jj] = l_pt[5*n_vect+j+jj] -GAMMA_X_SPIN1_VAL*l_pt[(3*GAMMA_X_SPIN1_CO+2)*n_vect+j+jj];) + } + + static inline void prn_X_PRECISION( const buffer_PRECISION prn_pt, const buffer_PRECISION l_pt ) { prn_pt[0] = l_pt[0] +GAMMA_X_SPIN0_VAL*l_pt[3*GAMMA_X_SPIN0_CO]; prn_pt[1] = l_pt[1] +GAMMA_X_SPIN0_VAL*l_pt[3*GAMMA_X_SPIN0_CO+1]; prn_pt[2] = l_pt[2] +GAMMA_X_SPIN0_VAL*l_pt[3*GAMMA_X_SPIN0_CO+2]; @@ -312,7 +509,17 @@ prn_pt[5] = l_pt[5] +GAMMA_X_SPIN1_VAL*l_pt[3*GAMMA_X_SPIN1_CO+2]; } - static inline void pbp_su3_X_PRECISION( const vector_PRECISION prp_su3_pt, const vector_PRECISION l_pt ) { + static inline void prn_X_PRECISION_new( const buffer_PRECISION prn_pt, const buffer_PRECISION l_pt ) { + int n_vect = g.num_rhs_vect, j, jj; + VECTOR_LOOP(j, n_vect, jj, prn_pt[0*n_vect+j+jj] = l_pt[0*n_vect+j+jj] +GAMMA_X_SPIN0_VAL*l_pt[3*GAMMA_X_SPIN0_CO*n_vect+j+jj]; + prn_pt[1*n_vect+j+jj] = l_pt[1*n_vect+j+jj] +GAMMA_X_SPIN0_VAL*l_pt[(3*GAMMA_X_SPIN0_CO+1)*n_vect+j+jj]; + prn_pt[2*n_vect+j+jj] = l_pt[2*n_vect+j+jj] +GAMMA_X_SPIN0_VAL*l_pt[(3*GAMMA_X_SPIN0_CO+2)*n_vect+j+jj]; + prn_pt[3*n_vect+j+jj] = l_pt[3*n_vect+j+jj] +GAMMA_X_SPIN1_VAL*l_pt[3*GAMMA_X_SPIN1_CO*n_vect+j+jj]; + prn_pt[4*n_vect+j+jj] = 
l_pt[4*n_vect+j+jj] +GAMMA_X_SPIN1_VAL*l_pt[(3*GAMMA_X_SPIN1_CO+1)*n_vect+j+jj]; + prn_pt[5*n_vect+j+jj] = l_pt[5*n_vect+j+jj] +GAMMA_X_SPIN1_VAL*l_pt[(3*GAMMA_X_SPIN1_CO+2)*n_vect+j+jj];) + } + + static inline void pbp_su3_X_PRECISION( const buffer_PRECISION prp_su3_pt, const buffer_PRECISION l_pt ) { l_pt[ 0] -= prp_su3_pt[0]; l_pt[ 1] -= prp_su3_pt[1]; l_pt[ 2] -= prp_su3_pt[2]; @@ -327,7 +534,23 @@ l_pt[11] += GAMMA_X_SPIN3_VAL*prp_su3_pt[3*GAMMA_X_SPIN3_CO+2]; } - static inline void pbn_su3_X_PRECISION( const vector_PRECISION prn_su3_pt, const vector_PRECISION l_pt ) { + static inline void pbp_su3_X_PRECISION_new( const buffer_PRECISION prp_su3_pt, const buffer_PRECISION l_pt ) { + int n_vect = g.num_rhs_vect, j, jj; + VECTOR_LOOP(j, n_vect, jj, l_pt[ 0*n_vect+j+jj] -= prp_su3_pt[0*n_vect+j+jj]; + l_pt[ 1*n_vect+j+jj] -= prp_su3_pt[1*n_vect+j+jj]; + l_pt[ 2*n_vect+j+jj] -= prp_su3_pt[2*n_vect+j+jj]; + l_pt[ 3*n_vect+j+jj] -= prp_su3_pt[3*n_vect+j+jj]; + l_pt[ 4*n_vect+j+jj] -= prp_su3_pt[4*n_vect+j+jj]; + l_pt[ 5*n_vect+j+jj] -= prp_su3_pt[5*n_vect+j+jj]; + l_pt[ 6*n_vect+j+jj] += GAMMA_X_SPIN2_VAL*prp_su3_pt[3*GAMMA_X_SPIN2_CO*n_vect+j+jj]; + l_pt[ 7*n_vect+j+jj] += GAMMA_X_SPIN2_VAL*prp_su3_pt[(3*GAMMA_X_SPIN2_CO+1)*n_vect+j+jj]; + l_pt[ 8*n_vect+j+jj] += GAMMA_X_SPIN2_VAL*prp_su3_pt[(3*GAMMA_X_SPIN2_CO+2)*n_vect+j+jj]; + l_pt[ 9*n_vect+j+jj] += GAMMA_X_SPIN3_VAL*prp_su3_pt[3*GAMMA_X_SPIN3_CO*n_vect+j+jj]; + l_pt[10*n_vect+j+jj] += GAMMA_X_SPIN3_VAL*prp_su3_pt[(3*GAMMA_X_SPIN3_CO+1)*n_vect+j+jj]; + l_pt[11*n_vect+j+jj] += GAMMA_X_SPIN3_VAL*prp_su3_pt[(3*GAMMA_X_SPIN3_CO+2)*n_vect+j+jj];) + } + + static inline void pbn_su3_X_PRECISION( const buffer_PRECISION prn_su3_pt, const buffer_PRECISION l_pt ) { l_pt[ 0] -= prn_su3_pt[0]; l_pt[ 1] -= prn_su3_pt[1]; l_pt[ 2] -= prn_su3_pt[2]; @@ -342,6 +565,22 @@ l_pt[11] -= GAMMA_X_SPIN3_VAL*prn_su3_pt[3*GAMMA_X_SPIN3_CO+2]; } + static inline void pbn_su3_X_PRECISION_new( const buffer_PRECISION prn_su3_pt, const buffer_PRECISION l_pt ) { + int n_vect = g.num_rhs_vect, j, jj; + VECTOR_LOOP(j, n_vect, jj, l_pt[ 0*n_vect+j+jj] -= prn_su3_pt[0*n_vect+j+jj]; + l_pt[ 1*n_vect+j+jj] -= prn_su3_pt[1*n_vect+j+jj]; + l_pt[ 2*n_vect+j+jj] -= prn_su3_pt[2*n_vect+j+jj]; + l_pt[ 3*n_vect+j+jj] -= prn_su3_pt[3*n_vect+j+jj]; + l_pt[ 4*n_vect+j+jj] -= prn_su3_pt[4*n_vect+j+jj]; + l_pt[ 5*n_vect+j+jj] -= prn_su3_pt[5*n_vect+j+jj]; + l_pt[ 6*n_vect+j+jj] -= GAMMA_X_SPIN2_VAL*prn_su3_pt[3*GAMMA_X_SPIN2_CO*n_vect+j+jj]; + l_pt[ 7*n_vect+j+jj] -= GAMMA_X_SPIN2_VAL*prn_su3_pt[(3*GAMMA_X_SPIN2_CO+1)*n_vect+j+jj]; + l_pt[ 8*n_vect+j+jj] -= GAMMA_X_SPIN2_VAL*prn_su3_pt[(3*GAMMA_X_SPIN2_CO+2)*n_vect+j+jj]; + l_pt[ 9*n_vect+j+jj] -= GAMMA_X_SPIN3_VAL*prn_su3_pt[3*GAMMA_X_SPIN3_CO*n_vect+j+jj]; + l_pt[10*n_vect+j+jj] -= GAMMA_X_SPIN3_VAL*prn_su3_pt[(3*GAMMA_X_SPIN3_CO+1)*n_vect+j+jj]; + l_pt[11*n_vect+j+jj] -= GAMMA_X_SPIN3_VAL*prn_su3_pt[(3*GAMMA_X_SPIN3_CO+2)*n_vect+j+jj];) + } + //START #ifdef HAVE_TM1p1 @@ -349,7 +588,7 @@ #define flav_gamma(k) (3*(k)+6*((k)/2)) // 1 - gamma_T - static inline void dprp_T_PRECISION( const vector_PRECISION prp_pt, const vector_PRECISION l_pt ) { + static inline void dprp_T_PRECISION( const buffer_PRECISION prp_pt, const buffer_PRECISION l_pt ) { prp_pt[ 0] = l_pt[ 0] -GAMMA_T_SPIN0_VAL*l_pt[flav_gamma(GAMMA_T_SPIN0_CO)]; prp_pt[ 1] = l_pt[ 1] -GAMMA_T_SPIN0_VAL*l_pt[flav_gamma(GAMMA_T_SPIN0_CO)+1]; prp_pt[ 2] = l_pt[ 2] -GAMMA_T_SPIN0_VAL*l_pt[flav_gamma(GAMMA_T_SPIN0_CO)+2]; @@ -365,7 +604,7 @@ } // 1 + gamma_T - static inline void 
dprn_T_PRECISION( const vector_PRECISION prn_pt, const vector_PRECISION l_pt ) { + static inline void dprn_T_PRECISION( const buffer_PRECISION prn_pt, const buffer_PRECISION l_pt ) { prn_pt[ 0] = l_pt[ 0] +GAMMA_T_SPIN0_VAL*l_pt[flav_gamma(GAMMA_T_SPIN0_CO)]; prn_pt[ 1] = l_pt[ 1] +GAMMA_T_SPIN0_VAL*l_pt[flav_gamma(GAMMA_T_SPIN0_CO)+1]; prn_pt[ 2] = l_pt[ 2] +GAMMA_T_SPIN0_VAL*l_pt[flav_gamma(GAMMA_T_SPIN0_CO)+2]; @@ -381,7 +620,7 @@ } // - (1 - gamma_T) - static inline void dpbp_su3_T_PRECISION( const vector_PRECISION prp_su3_pt, const vector_PRECISION l_pt ) { + static inline void dpbp_su3_T_PRECISION( const buffer_PRECISION prp_su3_pt, const buffer_PRECISION l_pt ) { l_pt[ 0] -= prp_su3_pt[ 0]; l_pt[ 1] -= prp_su3_pt[ 1]; l_pt[ 2] -= prp_su3_pt[ 2]; @@ -409,7 +648,7 @@ } // -(1 + gamma_T) - static inline void dpbn_su3_T_PRECISION( const vector_PRECISION prn_su3_pt, const vector_PRECISION l_pt ) { + static inline void dpbn_su3_T_PRECISION( const buffer_PRECISION prn_su3_pt, const buffer_PRECISION l_pt ) { l_pt[ 0] -= prn_su3_pt[ 0]; l_pt[ 1] -= prn_su3_pt[ 1]; l_pt[ 2] -= prn_su3_pt[ 2]; @@ -438,7 +677,7 @@ // 1 - gamma_Z - static inline void dprp_Z_PRECISION( const vector_PRECISION prp_pt, const vector_PRECISION l_pt ) { + static inline void dprp_Z_PRECISION( const buffer_PRECISION prp_pt, const buffer_PRECISION l_pt ) { prp_pt[ 0] = l_pt[ 0] -GAMMA_Z_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Z_SPIN0_CO)]; prp_pt[ 1] = l_pt[ 1] -GAMMA_Z_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Z_SPIN0_CO)+1]; prp_pt[ 2] = l_pt[ 2] -GAMMA_Z_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Z_SPIN0_CO)+2]; @@ -454,7 +693,7 @@ } // 1 + gamma_Z - static inline void dprn_Z_PRECISION( const vector_PRECISION prn_pt, const vector_PRECISION l_pt ) { + static inline void dprn_Z_PRECISION( const buffer_PRECISION prn_pt, const buffer_PRECISION l_pt ) { prn_pt[ 0] = l_pt[ 0] +GAMMA_Z_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Z_SPIN0_CO)]; prn_pt[ 1] = l_pt[ 1] +GAMMA_Z_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Z_SPIN0_CO)+1]; prn_pt[ 2] = l_pt[ 2] +GAMMA_Z_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Z_SPIN0_CO)+2]; @@ -470,7 +709,7 @@ } // - (1 - gamma_Z) - static inline void dpbp_su3_Z_PRECISION( const vector_PRECISION prp_su3_pt, const vector_PRECISION l_pt ) { + static inline void dpbp_su3_Z_PRECISION( const buffer_PRECISION prp_su3_pt, const buffer_PRECISION l_pt ) { l_pt[ 0] -= prp_su3_pt[ 0]; l_pt[ 1] -= prp_su3_pt[ 1]; l_pt[ 2] -= prp_su3_pt[ 2]; @@ -498,7 +737,7 @@ } // -(1 + gamma_Z) - static inline void dpbn_su3_Z_PRECISION( const vector_PRECISION prn_su3_pt, const vector_PRECISION l_pt ) { + static inline void dpbn_su3_Z_PRECISION( const buffer_PRECISION prn_su3_pt, const buffer_PRECISION l_pt ) { l_pt[ 0] -= prn_su3_pt[ 0]; l_pt[ 1] -= prn_su3_pt[ 1]; l_pt[ 2] -= prn_su3_pt[ 2]; @@ -527,7 +766,7 @@ // 1 - gamma_Y - static inline void dprp_Y_PRECISION( const vector_PRECISION prp_pt, const vector_PRECISION l_pt ) { + static inline void dprp_Y_PRECISION( const buffer_PRECISION prp_pt, const buffer_PRECISION l_pt ) { prp_pt[ 0] = l_pt[ 0] -GAMMA_Y_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Y_SPIN0_CO)]; prp_pt[ 1] = l_pt[ 1] -GAMMA_Y_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Y_SPIN0_CO)+1]; prp_pt[ 2] = l_pt[ 2] -GAMMA_Y_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Y_SPIN0_CO)+2]; @@ -543,7 +782,7 @@ } // 1 + gamma_Y - static inline void dprn_Y_PRECISION( const vector_PRECISION prn_pt, const vector_PRECISION l_pt ) { + static inline void dprn_Y_PRECISION( const buffer_PRECISION prn_pt, const buffer_PRECISION l_pt ) { prn_pt[ 0] = l_pt[ 0] +GAMMA_Y_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Y_SPIN0_CO)]; prn_pt[ 1] = 
l_pt[ 1] +GAMMA_Y_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Y_SPIN0_CO)+1]; prn_pt[ 2] = l_pt[ 2] +GAMMA_Y_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Y_SPIN0_CO)+2]; @@ -559,7 +798,7 @@ } // - (1 - gamma_Y) - static inline void dpbp_su3_Y_PRECISION( const vector_PRECISION prp_su3_pt, const vector_PRECISION l_pt ) { + static inline void dpbp_su3_Y_PRECISION( const buffer_PRECISION prp_su3_pt, const buffer_PRECISION l_pt ) { l_pt[ 0] -= prp_su3_pt[ 0]; l_pt[ 1] -= prp_su3_pt[ 1]; l_pt[ 2] -= prp_su3_pt[ 2]; @@ -587,7 +826,7 @@ } // -(1 + gamma_Y) - static inline void dpbn_su3_Y_PRECISION( const vector_PRECISION prn_su3_pt, const vector_PRECISION l_pt ) { + static inline void dpbn_su3_Y_PRECISION( const buffer_PRECISION prn_su3_pt, const buffer_PRECISION l_pt ) { l_pt[ 0] -= prn_su3_pt[ 0]; l_pt[ 1] -= prn_su3_pt[ 1]; l_pt[ 2] -= prn_su3_pt[ 2]; @@ -616,7 +855,7 @@ // 1 - gamma_X - static inline void dprp_X_PRECISION( const vector_PRECISION prp_pt, const vector_PRECISION l_pt ) { + static inline void dprp_X_PRECISION( const buffer_PRECISION prp_pt, const buffer_PRECISION l_pt ) { prp_pt[ 0] = l_pt[ 0] -GAMMA_X_SPIN0_VAL*l_pt[flav_gamma(GAMMA_X_SPIN0_CO)]; prp_pt[ 1] = l_pt[ 1] -GAMMA_X_SPIN0_VAL*l_pt[flav_gamma(GAMMA_X_SPIN0_CO)+1]; prp_pt[ 2] = l_pt[ 2] -GAMMA_X_SPIN0_VAL*l_pt[flav_gamma(GAMMA_X_SPIN0_CO)+2]; @@ -632,7 +871,7 @@ } // 1 + gamma_X - static inline void dprn_X_PRECISION( const vector_PRECISION prn_pt, const vector_PRECISION l_pt ) { + static inline void dprn_X_PRECISION( const buffer_PRECISION prn_pt, const buffer_PRECISION l_pt ) { prn_pt[ 0] = l_pt[ 0] +GAMMA_X_SPIN0_VAL*l_pt[flav_gamma(GAMMA_X_SPIN0_CO)]; prn_pt[ 1] = l_pt[ 1] +GAMMA_X_SPIN0_VAL*l_pt[flav_gamma(GAMMA_X_SPIN0_CO)+1]; prn_pt[ 2] = l_pt[ 2] +GAMMA_X_SPIN0_VAL*l_pt[flav_gamma(GAMMA_X_SPIN0_CO)+2]; @@ -648,7 +887,7 @@ } // - (1 - gamma_X) - static inline void dpbp_su3_X_PRECISION( const vector_PRECISION prp_su3_pt, const vector_PRECISION l_pt ) { + static inline void dpbp_su3_X_PRECISION( const buffer_PRECISION prp_su3_pt, const buffer_PRECISION l_pt ) { l_pt[ 0] -= prp_su3_pt[ 0]; l_pt[ 1] -= prp_su3_pt[ 1]; l_pt[ 2] -= prp_su3_pt[ 2]; @@ -676,7 +915,7 @@ } // -(1 + gamma_X) - static inline void dpbn_su3_X_PRECISION( const vector_PRECISION prn_su3_pt, const vector_PRECISION l_pt ) { + static inline void dpbn_su3_X_PRECISION( const buffer_PRECISION prn_su3_pt, const buffer_PRECISION l_pt ) { l_pt[ 0] -= prn_su3_pt[ 0]; l_pt[ 1] -= prn_su3_pt[ 1]; l_pt[ 2] -= prn_su3_pt[ 2]; @@ -706,7 +945,7 @@ #endif //END - static inline void twospin_p_T_PRECISION( const vector_PRECISION out_spin0and1, const vector_PRECISION out_spin2and3, const vector_PRECISION in ) { + static inline void twospin_p_T_PRECISION( const buffer_PRECISION out_spin0and1, const buffer_PRECISION out_spin2and3, const buffer_PRECISION in ) { out_spin0and1[ 0] -= in[ 0]; out_spin0and1[ 1] -= in[ 1]; out_spin0and1[ 2] -= in[ 2]; @@ -733,7 +972,7 @@ out_spin2and3[11] -= in[11]; } - static inline void twospin2_p_T_PRECISION( const vector_PRECISION out_spin0and1, const vector_PRECISION out_spin2and3, const vector_PRECISION in ) { + static inline void twospin2_p_T_PRECISION( const buffer_PRECISION out_spin0and1, const buffer_PRECISION out_spin2and3, const buffer_PRECISION in ) { out_spin0and1[ 0] = in[ 0]; out_spin0and1[ 1] = in[ 1]; out_spin0and1[ 2] = in[ 2]; @@ -760,7 +999,7 @@ out_spin2and3[11] = in[11]; } - static inline void twospin_n_T_PRECISION( const vector_PRECISION out_spin0and1, const vector_PRECISION out_spin2and3, const vector_PRECISION in ) { + static inline void 
twospin_n_T_PRECISION( const buffer_PRECISION out_spin0and1, const buffer_PRECISION out_spin2and3, const buffer_PRECISION in ) { out_spin0and1[ 0] -= in[ 0]; out_spin0and1[ 1] -= in[ 1]; out_spin0and1[ 2] -= in[ 2]; @@ -787,7 +1026,7 @@ out_spin2and3[11] -= in[11]; } - static inline void twospin_p_Z_PRECISION( const vector_PRECISION out_spin0and1, const vector_PRECISION out_spin2and3, const vector_PRECISION in ) { + static inline void twospin_p_Z_PRECISION( const buffer_PRECISION out_spin0and1, const buffer_PRECISION out_spin2and3, const buffer_PRECISION in ) { out_spin0and1[ 0] -= in[ 0]; out_spin0and1[ 1] -= in[ 1]; out_spin0and1[ 2] -= in[ 2]; @@ -814,7 +1053,7 @@ out_spin2and3[11] -= in[11]; } - static inline void twospin2_p_Z_PRECISION( const vector_PRECISION out_spin0and1, const vector_PRECISION out_spin2and3, const vector_PRECISION in ) { + static inline void twospin2_p_Z_PRECISION( const buffer_PRECISION out_spin0and1, const buffer_PRECISION out_spin2and3, const buffer_PRECISION in ) { out_spin0and1[ 0] = in[ 0]; out_spin0and1[ 1] = in[ 1]; out_spin0and1[ 2] = in[ 2]; @@ -841,7 +1080,7 @@ out_spin2and3[11] = in[11]; } - static inline void twospin_n_Z_PRECISION( const vector_PRECISION out_spin0and1, const vector_PRECISION out_spin2and3, const vector_PRECISION in ) { + static inline void twospin_n_Z_PRECISION( const buffer_PRECISION out_spin0and1, const buffer_PRECISION out_spin2and3, const buffer_PRECISION in ) { out_spin0and1[ 0] -= in[ 0]; out_spin0and1[ 1] -= in[ 1]; out_spin0and1[ 2] -= in[ 2]; @@ -868,7 +1107,7 @@ out_spin2and3[11] -= in[11]; } - static inline void twospin_p_Y_PRECISION( const vector_PRECISION out_spin0and1, const vector_PRECISION out_spin2and3, const vector_PRECISION in ) { + static inline void twospin_p_Y_PRECISION( const buffer_PRECISION out_spin0and1, const buffer_PRECISION out_spin2and3, const buffer_PRECISION in ) { out_spin0and1[ 0] -= in[ 0]; out_spin0and1[ 1] -= in[ 1]; out_spin0and1[ 2] -= in[ 2]; @@ -895,7 +1134,7 @@ out_spin2and3[11] -= in[11]; } - static inline void twospin2_p_Y_PRECISION( const vector_PRECISION out_spin0and1, const vector_PRECISION out_spin2and3, const vector_PRECISION in ) { + static inline void twospin2_p_Y_PRECISION( const buffer_PRECISION out_spin0and1, const buffer_PRECISION out_spin2and3, const buffer_PRECISION in ) { out_spin0and1[ 0] = in[ 0]; out_spin0and1[ 1] = in[ 1]; out_spin0and1[ 2] = in[ 2]; @@ -922,7 +1161,7 @@ out_spin2and3[11] = in[11]; } - static inline void twospin_n_Y_PRECISION( const vector_PRECISION out_spin0and1, const vector_PRECISION out_spin2and3, const vector_PRECISION in ) { + static inline void twospin_n_Y_PRECISION( const buffer_PRECISION out_spin0and1, const buffer_PRECISION out_spin2and3, const buffer_PRECISION in ) { out_spin0and1[ 0] -= in[ 0]; out_spin0and1[ 1] -= in[ 1]; out_spin0and1[ 2] -= in[ 2]; @@ -949,7 +1188,7 @@ out_spin2and3[11] -= in[11]; } - static inline void twospin_p_X_PRECISION( const vector_PRECISION out_spin0and1, const vector_PRECISION out_spin2and3, const vector_PRECISION in ) { + static inline void twospin_p_X_PRECISION( const buffer_PRECISION out_spin0and1, const buffer_PRECISION out_spin2and3, const buffer_PRECISION in ) { out_spin0and1[ 0] -= in[ 0]; out_spin0and1[ 1] -= in[ 1]; out_spin0and1[ 2] -= in[ 2]; @@ -976,7 +1215,7 @@ out_spin2and3[11] -= in[11]; } - static inline void twospin2_p_X_PRECISION( const vector_PRECISION out_spin0and1, const vector_PRECISION out_spin2and3, const vector_PRECISION in ) { + static inline void twospin2_p_X_PRECISION( const 
buffer_PRECISION out_spin0and1, const buffer_PRECISION out_spin2and3, const buffer_PRECISION in ) { out_spin0and1[ 0] = in[ 0]; out_spin0and1[ 1] = in[ 1]; out_spin0and1[ 2] = in[ 2]; @@ -1003,7 +1242,7 @@ out_spin2and3[11] = in[11]; } - static inline void twospin_n_X_PRECISION( const vector_PRECISION out_spin0and1, const vector_PRECISION out_spin2and3, const vector_PRECISION in ) { + static inline void twospin_n_X_PRECISION( const buffer_PRECISION out_spin0and1, const buffer_PRECISION out_spin2and3, const buffer_PRECISION in ) { out_spin0and1[ 0] -= in[ 0]; out_spin0and1[ 1] -= in[ 1]; out_spin0and1[ 2] -= in[ 2]; @@ -1030,7 +1269,7 @@ out_spin2and3[11] -= in[11]; } - static inline void doublet_site_clover_PRECISION( const vector_PRECISION eta, const vector_PRECISION phi, const config_PRECISION clover ) { + static inline void doublet_site_clover_PRECISION( const buffer_PRECISION eta, const buffer_PRECISION phi, const config_PRECISION clover ) { // diagonal eta[ 0] = clover[ 0]*phi[ 0]; eta[ 1] = clover[ 1]*phi[ 1]; @@ -1182,7 +1421,7 @@ eta[23] += conj_PRECISION(clover[41])*phi[22]; } - static inline void spin0and1_site_clover_PRECISION( const vector_PRECISION eta, const vector_PRECISION phi, const config_PRECISION clover ) { + static inline void spin0and1_site_clover_PRECISION( const buffer_PRECISION eta, const buffer_PRECISION phi, const config_PRECISION clover ) { // diagonal eta[ 0] = clover[ 0]*phi[ 0]; eta[ 1] = clover[ 1]*phi[ 1]; @@ -1229,7 +1468,7 @@ eta[5] += conj_PRECISION(clover[26])*phi[4]; } - static inline void spin2and3_site_clover_PRECISION( const vector_PRECISION eta, const vector_PRECISION phi, const config_PRECISION clover ) { + static inline void spin2and3_site_clover_PRECISION( const buffer_PRECISION eta, const buffer_PRECISION phi, const config_PRECISION clover ) { // diagonal eta[ 0] = _COMPLEX_PRECISION_ZERO; eta[ 1] = _COMPLEX_PRECISION_ZERO; @@ -1276,7 +1515,7 @@ eta[11] += conj_PRECISION(clover[41])*phi[10]; } - static inline void site_clover_PRECISION( const vector_PRECISION eta, const vector_PRECISION phi, const config_PRECISION clover ) { + static inline void site_clover_PRECISION( const buffer_PRECISION eta, const buffer_PRECISION phi, const config_PRECISION clover ) { // diagonal eta[ 0] = clover[ 0]*phi[ 0]; eta[ 1] = clover[ 1]*phi[ 1]; @@ -1353,5 +1592,86 @@ eta[11] += conj_PRECISION(clover[40])*phi[ 9]; eta[11] += conj_PRECISION(clover[41])*phi[10]; } - + + + + static inline void site_clover_PRECISION_new( const buffer_PRECISION eta, const buffer_PRECISION phi, const config_PRECISION clover ) { + int n_vect = g.num_rhs_vect, j, jj; + VECTOR_LOOP(j, n_vect, jj, // diagonal + eta[ 0*n_vect+j+jj] = clover[ 0]*phi[ 0*n_vect+j+jj]; + eta[ 1*n_vect+j+jj] = clover[ 1]*phi[ 1*n_vect+j+jj]; + eta[ 2*n_vect+j+jj] = clover[ 2]*phi[ 2*n_vect+j+jj]; + eta[ 3*n_vect+j+jj] = clover[ 3]*phi[ 3*n_vect+j+jj]; + eta[ 4*n_vect+j+jj] = clover[ 4]*phi[ 4*n_vect+j+jj]; + eta[ 5*n_vect+j+jj] = clover[ 5]*phi[ 5*n_vect+j+jj]; + eta[ 6*n_vect+j+jj] = clover[ 6]*phi[ 6*n_vect+j+jj]; + eta[ 7*n_vect+j+jj] = clover[ 7]*phi[ 7*n_vect+j+jj]; + eta[ 8*n_vect+j+jj] = clover[ 8]*phi[ 8*n_vect+j+jj]; + eta[ 9*n_vect+j+jj] = clover[ 9]*phi[ 9*n_vect+j+jj]; + eta[10*n_vect+j+jj] = clover[10]*phi[10*n_vect+j+jj]; + eta[11*n_vect+j+jj] = clover[11]*phi[11*n_vect+j+jj]; + // spin 0 and 1, row major + eta[0*n_vect+j+jj] += clover[12]*phi[1*n_vect+j+jj]; + eta[0*n_vect+j+jj] += clover[13]*phi[2*n_vect+j+jj]; + eta[0*n_vect+j+jj] += clover[14]*phi[3*n_vect+j+jj]; + eta[0*n_vect+j+jj] += 
clover[15]*phi[4*n_vect+j+jj]; + eta[0*n_vect+j+jj] += clover[16]*phi[5*n_vect+j+jj]; + eta[1*n_vect+j+jj] += clover[17]*phi[2*n_vect+j+jj]; + eta[1*n_vect+j+jj] += clover[18]*phi[3*n_vect+j+jj]; + eta[1*n_vect+j+jj] += clover[19]*phi[4*n_vect+j+jj]; + eta[1*n_vect+j+jj] += clover[20]*phi[5*n_vect+j+jj]; + eta[2*n_vect+j+jj] += clover[21]*phi[3*n_vect+j+jj]; + eta[2*n_vect+j+jj] += clover[22]*phi[4*n_vect+j+jj]; + eta[2*n_vect+j+jj] += clover[23]*phi[5*n_vect+j+jj]; + eta[3*n_vect+j+jj] += clover[24]*phi[4*n_vect+j+jj]; + eta[3*n_vect+j+jj] += clover[25]*phi[5*n_vect+j+jj]; + eta[4*n_vect+j+jj] += clover[26]*phi[5*n_vect+j+jj]; + eta[1*n_vect+j+jj] += conj_PRECISION(clover[12])*phi[0*n_vect+j+jj]; + eta[2*n_vect+j+jj] += conj_PRECISION(clover[13])*phi[0*n_vect+j+jj]; + eta[3*n_vect+j+jj] += conj_PRECISION(clover[14])*phi[0*n_vect+j+jj]; + eta[4*n_vect+j+jj] += conj_PRECISION(clover[15])*phi[0*n_vect+j+jj]; + eta[5*n_vect+j+jj] += conj_PRECISION(clover[16])*phi[0*n_vect+j+jj]; + eta[2*n_vect+j+jj] += conj_PRECISION(clover[17])*phi[1*n_vect+j+jj]; + eta[3*n_vect+j+jj] += conj_PRECISION(clover[18])*phi[1*n_vect+j+jj]; + eta[4*n_vect+j+jj] += conj_PRECISION(clover[19])*phi[1*n_vect+j+jj]; + eta[5*n_vect+j+jj] += conj_PRECISION(clover[20])*phi[1*n_vect+j+jj]; + eta[3*n_vect+j+jj] += conj_PRECISION(clover[21])*phi[2*n_vect+j+jj]; + eta[4*n_vect+j+jj] += conj_PRECISION(clover[22])*phi[2*n_vect+j+jj]; + eta[5*n_vect+j+jj] += conj_PRECISION(clover[23])*phi[2*n_vect+j+jj]; + eta[4*n_vect+j+jj] += conj_PRECISION(clover[24])*phi[3*n_vect+j+jj]; + eta[5*n_vect+j+jj] += conj_PRECISION(clover[25])*phi[3*n_vect+j+jj]; + eta[5*n_vect+j+jj] += conj_PRECISION(clover[26])*phi[4*n_vect+j+jj]; + // spin 2 and 3, row major + eta[ 6*n_vect+j+jj] += clover[27]*phi[ 7*n_vect+j+jj]; + eta[ 6*n_vect+j+jj] += clover[28]*phi[ 8*n_vect+j+jj]; + eta[ 6*n_vect+j+jj] += clover[29]*phi[ 9*n_vect+j+jj]; + eta[ 6*n_vect+j+jj] += clover[30]*phi[10*n_vect+j+jj]; + eta[ 6*n_vect+j+jj] += clover[31]*phi[11*n_vect+j+jj]; + eta[ 7*n_vect+j+jj] += clover[32]*phi[ 8*n_vect+j+jj]; + eta[ 7*n_vect+j+jj] += clover[33]*phi[ 9*n_vect+j+jj]; + eta[ 7*n_vect+j+jj] += clover[34]*phi[10*n_vect+j+jj]; + eta[ 7*n_vect+j+jj] += clover[35]*phi[11*n_vect+j+jj]; + eta[ 8*n_vect+j+jj] += clover[36]*phi[ 9*n_vect+j+jj]; + eta[ 8*n_vect+j+jj] += clover[37]*phi[10*n_vect+j+jj]; + eta[ 8*n_vect+j+jj] += clover[38]*phi[11*n_vect+j+jj]; + eta[ 9*n_vect+j+jj] += clover[39]*phi[10*n_vect+j+jj]; + eta[ 9*n_vect+j+jj] += clover[40]*phi[11*n_vect+j+jj]; + eta[10*n_vect+j+jj] += clover[41]*phi[11*n_vect+j+jj]; + eta[ 7*n_vect+j+jj] += conj_PRECISION(clover[27])*phi[ 6*n_vect+j+jj]; + eta[ 8*n_vect+j+jj] += conj_PRECISION(clover[28])*phi[ 6*n_vect+j+jj]; + eta[ 9*n_vect+j+jj] += conj_PRECISION(clover[29])*phi[ 6*n_vect+j+jj]; + eta[10*n_vect+j+jj] += conj_PRECISION(clover[30])*phi[ 6*n_vect+j+jj]; + eta[11*n_vect+j+jj] += conj_PRECISION(clover[31])*phi[ 6*n_vect+j+jj]; + eta[ 8*n_vect+j+jj] += conj_PRECISION(clover[32])*phi[ 7*n_vect+j+jj]; + eta[ 9*n_vect+j+jj] += conj_PRECISION(clover[33])*phi[ 7*n_vect+j+jj]; + eta[10*n_vect+j+jj] += conj_PRECISION(clover[34])*phi[ 7*n_vect+j+jj]; + eta[11*n_vect+j+jj] += conj_PRECISION(clover[35])*phi[ 7*n_vect+j+jj]; + eta[ 9*n_vect+j+jj] += conj_PRECISION(clover[36])*phi[ 8*n_vect+j+jj]; + eta[10*n_vect+j+jj] += conj_PRECISION(clover[37])*phi[ 8*n_vect+j+jj]; + eta[11*n_vect+j+jj] += conj_PRECISION(clover[38])*phi[ 8*n_vect+j+jj]; + eta[10*n_vect+j+jj] += conj_PRECISION(clover[39])*phi[ 9*n_vect+j+jj]; + 
eta[11*n_vect+j+jj] += conj_PRECISION(clover[40])*phi[ 9*n_vect+j+jj]; + eta[11*n_vect+j+jj] += conj_PRECISION(clover[41])*phi[10*n_vect+j+jj];) + } + #endif diff --git a/src/gathering_generic.c b/src/gathering_generic.c index 2eb10fc..fbf0445 100644 --- a/src/gathering_generic.c +++ b/src/gathering_generic.c @@ -28,8 +28,8 @@ void gathering_PRECISION_next_level_init( gathering_PRECISION_struct *gs, level_ gs->permutation = NULL; gs->gather_list = NULL; gs->reqs = NULL; - gs->buffer = NULL; - gs->transfer_buffer = NULL; + vector_PRECISION_init(&(gs->buffer)); + vector_PRECISION_init(&(gs->transfer_buffer)); gs->dist_inner_lattice_sites = 1; gs->gather_list_length = 1; @@ -49,9 +49,9 @@ void gathering_PRECISION_setup( gathering_PRECISION_struct *gs, level_struct *l process_coords[4] = {0,0,0,0}, parent_coords[4] = {0,0,0,0}, *process_list = NULL; MALLOC( process_list, int, l->num_processes ); #ifdef HAVE_TM1p1 - MALLOC( gs->transfer_buffer, complex_PRECISION, 2 * gs->dist_inner_lattice_sites * l->num_lattice_site_var ); + MALLOC( gs->transfer_buffer.vector_buffer, complex_PRECISION, 2 * gs->dist_inner_lattice_sites * l->num_lattice_site_var ); #else - MALLOC( gs->transfer_buffer, complex_PRECISION, gs->dist_inner_lattice_sites * l->num_lattice_site_var ); + MALLOC( gs->transfer_buffer.vector_buffer, complex_PRECISION, gs->dist_inner_lattice_sites * l->num_lattice_site_var ); #endif l->idle = 0; @@ -96,9 +96,9 @@ void gathering_PRECISION_setup( gathering_PRECISION_struct *gs, level_struct *l MALLOC( gs->permutation, int, l->num_inner_lattice_sites ); MALLOC( gs->reqs, MPI_Request, gs->gather_list_length ); #ifdef HAVE_TM1p1 - MALLOC( gs->buffer, complex_PRECISION, 2*l->inner_vector_size ); + vector_PRECISION_alloc( &(gs->buffer), _INNER, 2, l, no_threading ); #else - MALLOC( gs->buffer, complex_PRECISION, l->inner_vector_size ); + vector_PRECISION_alloc( &(gs->buffer), _INNER, 1, l, no_threading ); #endif MALLOC( field1, int, l->num_inner_lattice_sites ); MALLOC( field2, int, l->num_inner_lattice_sites ); @@ -212,19 +212,15 @@ void gathering_PRECISION_free( gathering_PRECISION_struct *gs, level_struct *l ) FREE( gs->gather_list, int, gs->gather_list_length ); FREE( gs->permutation, int, l->num_inner_lattice_sites ); FREE( gs->reqs, MPI_Request, gs->gather_list_length ); -#ifdef HAVE_TM1p1 - FREE( gs->buffer, complex_PRECISION, 2*l->inner_vector_size ); -#else - FREE( gs->buffer, complex_PRECISION, l->inner_vector_size ); -#endif + vector_PRECISION_free( &(gs->buffer), l, no_threading ); } MPI_Comm_free( &(gs->level_comm) ); MPI_Group_free( &(gs->level_comm_group) ); #ifdef HAVE_TM1p1 - FREE( gs->transfer_buffer, complex_PRECISION, 2 * gs->dist_inner_lattice_sites * l->num_lattice_site_var ); + FREE( gs->transfer_buffer.vector_buffer, complex_PRECISION, 2 * gs->dist_inner_lattice_sites * l->num_lattice_site_var ); #else - FREE( gs->transfer_buffer, complex_PRECISION, gs->dist_inner_lattice_sites * l->num_lattice_site_var ); + FREE( gs->transfer_buffer.vector_buffer, complex_PRECISION, gs->dist_inner_lattice_sites * l->num_lattice_site_var ); #endif } @@ -270,17 +266,17 @@ void conf_PRECISION_gather( operator_PRECISION_struct *out, operator_PRECISION_s } else { int i, j, n=l->gs_PRECISION.gather_list_length, s=l->num_inner_lattice_sites, t, *pi = l->gs_PRECISION.permutation; - vector_PRECISION buffer_hopp = NULL, buffer_clov = NULL, buffer_odd_proj = NULL; + buffer_PRECISION buffer_hopp = NULL, buffer_clov = NULL, buffer_odd_proj = NULL; MPI_Request *hopp_reqs = NULL, *clov_reqs = NULL, 
*odd_proj_reqs = NULL; - + #ifdef HAVE_TM1p1 - vector_PRECISION buffer_eps_term = NULL; + buffer_PRECISION buffer_eps_term = NULL; MPI_Request *eps_term_reqs = NULL; MALLOC( buffer_eps_term, complex_PRECISION, n*send_size_block ); MALLOC( eps_term_reqs, MPI_Request, n ); #endif #ifdef HAVE_TM - vector_PRECISION buffer_tm_term = NULL; + buffer_PRECISION buffer_tm_term = NULL; MPI_Request *tm_term_reqs = NULL; MALLOC( buffer_tm_term, complex_PRECISION, n*send_size_block ); MALLOC( tm_term_reqs, MPI_Request, n ); @@ -408,12 +404,12 @@ void conf_PRECISION_gather( operator_PRECISION_struct *out, operator_PRECISION_s l->dummy_p_PRECISION.eval_operator = apply_coarse_operator_PRECISION; } -void vector_PRECISION_gather( vector_PRECISION gath, vector_PRECISION dist, level_struct *l ) { +void vector_PRECISION_gather( vector_PRECISION *gath, vector_PRECISION *dist, level_struct *l ) { int send_size = l->gs_PRECISION.dist_inner_lattice_sites * l->num_lattice_site_var; if ( g.my_rank != l->parent_rank ) { - MPI_Send( dist, send_size, MPI_COMPLEX_PRECISION, l->parent_rank, g.my_rank, g.comm_cart ); + MPI_Send( dist->vector_buffer, send_size, MPI_COMPLEX_PRECISION, l->parent_rank, g.my_rank, g.comm_cart ); } else { int i, j, n=l->gs_PRECISION.gather_list_length, s=l->num_inner_lattice_sites, t=l->num_lattice_site_var, *pi = l->gs_PRECISION.permutation; @@ -421,12 +417,12 @@ void vector_PRECISION_gather( vector_PRECISION gath, vector_PRECISION dist, leve PROF_PRECISION_START( _GD_COMM ); for ( i=1; igs_PRECISION.gather_list[i], + MPI_Irecv( buffer.vector_buffer+i*send_size, send_size, MPI_COMPLEX_PRECISION, l->gs_PRECISION.gather_list[i], l->gs_PRECISION.gather_list[i], g.comm_cart, &(l->gs_PRECISION.reqs[i]) ); PROF_PRECISION_STOP( _GD_COMM, n-1 ); for ( i=0; ivector_buffer[i]; PROF_PRECISION_START( _GD_IDLE ); for ( i=1; ivector_buffer[ t*pi[i] + j ] = buffer.vector_buffer[ t*i + j ]; } } -void vector_PRECISION_distribute( vector_PRECISION dist, vector_PRECISION gath, level_struct *l ) { +void vector_PRECISION_distribute( vector_PRECISION *dist, vector_PRECISION *gath, level_struct *l ) { int send_size = l->gs_PRECISION.dist_inner_lattice_sites * l->num_lattice_site_var; if ( g.my_rank != l->parent_rank ) { - MPI_Recv( dist, send_size, MPI_COMPLEX_PRECISION, l->parent_rank, g.my_rank, g.comm_cart, MPI_STATUS_IGNORE ); + MPI_Recv( dist->vector_buffer, send_size, MPI_COMPLEX_PRECISION, l->parent_rank, g.my_rank, g.comm_cart, MPI_STATUS_IGNORE ); } else { int i, j, n=l->gs_PRECISION.gather_list_length, s=l->num_inner_lattice_sites, t=l->num_lattice_site_var, *pi = l->gs_PRECISION.permutation; @@ -453,16 +449,16 @@ void vector_PRECISION_distribute( vector_PRECISION dist, vector_PRECISION gath, // permute data according to desired distributed data layout for ( i=0; ivector_buffer[ t*pi[i]+j ]; PROF_PRECISION_START( _GD_COMM ); for ( i=1; igs_PRECISION.gather_list[i], + MPI_Isend( buffer.vector_buffer+i*send_size, send_size, MPI_COMPLEX_PRECISION, l->gs_PRECISION.gather_list[i], l->gs_PRECISION.gather_list[i], g.comm_cart, &(l->gs_PRECISION.reqs[i]) ); PROF_PRECISION_STOP( _GD_COMM, n-1 ); for ( i=0; ivector_buffer[i] = buffer.vector_buffer[i]; PROF_PRECISION_START( _GD_IDLE ); for ( i=1; iglobal_splitting[mu] > 1 ) { @@ -34,49 +34,18 @@ void negative_sendrecv_PRECISION( vector_PRECISION phi, const int mu, comm_PRECI for ( i=0; inum_boundary_sites[2*i]; - buffer = l->vbuf_PRECISION[8]+n*(boundary_start-l->num_inner_lattice_sites); - buffer_pt = buffer; + buffer.vector_buffer = 
l->vbuf_PRECISION[8].vector_buffer+n*(boundary_start-l->num_inner_lattice_sites); + buffer_pt.vector_buffer = buffer.vector_buffer; for ( i=0; ivector_buffer + n*boundary_table[i]; + for ( j=0; jvector_buffer+n*boundary_start, n*num_boundary_sites, MPI_COMPLEX_PRECISION, l->neighbor_rank[2*mu], 2*mu+1, g.comm_cart, &(c->rreqs[2*mu+1]) ); - MPI_Isend( buffer, n*num_boundary_sites, MPI_COMPLEX_PRECISION, - l->neighbor_rank[2*mu+1], 2*mu+1, g.comm_cart, &(c->sreqs[2*mu+1]) ); - } -} - - -void negative_sendrecv_PRECISION_vectorized( complex_PRECISION *phi, const int mu, comm_PRECISION_struct *c, - level_struct *l, int count, complex_PRECISION *buffer ) { - // send dir = -1 - if( l->global_splitting[mu] > 1 ) { - - int i, j, num_boundary_sites = c->num_boundary_sites[2*mu+1], boundary_start, - *boundary_table = c->boundary_table[2*mu+1], n = l->num_lattice_site_var; - - complex_PRECISION *tmp_pt; - complex_PRECISION *buffer_pt; - - boundary_start = l->num_inner_lattice_sites; - for ( i=0; inum_boundary_sites[2*i]; - - buffer_pt = buffer; - - for ( i=0; ineighbor_rank[2*mu], 2*mu+1, g.comm_cart, &(c->rreqs[2*mu+1]) ); - MPI_Isend( buffer, count*n*num_boundary_sites, MPI_COMPLEX_PRECISION, + MPI_Isend( buffer.vector_buffer, n*num_boundary_sites, MPI_COMPLEX_PRECISION, l->neighbor_rank[2*mu+1], 2*mu+1, g.comm_cart, &(c->sreqs[2*mu+1]) ); } } @@ -89,8 +58,6 @@ void negative_wait_PRECISION( const int mu, comm_PRECISION_struct *c, level_stru MPI_Wait( &(c->rreqs[2*mu+1]), MPI_STATUS_IGNORE ); } } - - void ghost_alloc_PRECISION( int buffer_size, comm_PRECISION_struct *c, level_struct *l ) { int mu, nu, factor=1; @@ -141,12 +108,13 @@ void ghost_alloc_PRECISION( int buffer_size, comm_PRECISION_struct *c, level_str #endif } } - - if ( l->vbuf_PRECISION[8] == NULL ) { + if ( l->vbuf_PRECISION[8].vector_buffer == NULL ) { #ifdef HAVE_TM1p1 - MALLOC( l->vbuf_PRECISION[8], complex_PRECISION, 2*l->vector_size ); + //vector_PRECISION_alloc( &(l->vbuf_PRECISION[8]), _ORDINARY, 2, l, no_threading); + MALLOC( l->vbuf_PRECISION[8].vector_buffer, complex_PRECISION, 2*l->vector_size ); #else - MALLOC( l->vbuf_PRECISION[8], complex_PRECISION, l->vector_size ); + //vector_PRECISION_alloc( &(l->vbuf_PRECISION[8]), _ORDINARY, 1, l, no_threading); + MALLOC( l->vbuf_PRECISION[8].vector_buffer, complex_PRECISION, l->vector_size ); #endif } } @@ -160,14 +128,14 @@ void ghost_free_PRECISION( comm_PRECISION_struct *c, level_struct *l ) { FREE( c->buffer[2*mu], complex_PRECISION, c->max_length[mu] ); FREE( c->buffer[2*mu+1], complex_PRECISION, c->max_length[mu] ); } - - if ( l->vbuf_PRECISION[8] != NULL ) { -#ifdef HAVE_TM1p1 - FREE( l->vbuf_PRECISION[8], complex_PRECISION, 2*l->vector_size ); -#else - FREE( l->vbuf_PRECISION[8], complex_PRECISION, l->vector_size ); -#endif - } + if ( l->vbuf_PRECISION[8].vector_buffer != NULL ){ + // vector_PRECISION_free( &(l->vbuf_PRECISION[8]), l, no_threading); +#ifdef HAVE_TM1p1 + FREE( l->vbuf_PRECISION[8].vector_buffer, complex_PRECISION, 2*l->vector_size ); + #else + FREE( l->vbuf_PRECISION[8].vector_buffer, complex_PRECISION, l->vector_size ); + #endif + } } @@ -185,14 +153,14 @@ void ghost_sendrecv_init_PRECISION( const int type, comm_PRECISION_struct *c, le } -void ghost_sendrecv_PRECISION( vector_PRECISION phi, const int mu, const int dir, +void ghost_sendrecv_PRECISION( buffer_PRECISION phi, const int mu, const int dir, comm_PRECISION_struct *c, const int amount, level_struct *l ) { // does not allow sending in both directions at the same time if( l->global_splitting[mu] > 1 ) { 
int i, j, *table=NULL, mu_dir = 2*mu-MIN(dir,0), offset = c->offset, length[2] = {0,0}, comm_start = 0, table_start = 0; - vector_PRECISION buffer, phi_pt; + buffer_PRECISION buffer, phi_pt; if ( amount == _FULL_SYSTEM ) { length[0] = (c->num_boundary_sites[2*mu])*offset; @@ -229,7 +197,7 @@ void ghost_sendrecv_PRECISION( vector_PRECISION phi, const int mu, const int dir ghost_alloc_PRECISION( MAX(length[0],length[1]), c, l ); } - buffer = (vector_PRECISION)c->buffer[mu_dir]; + buffer = c->buffer[mu_dir]; // dir = senddir if ( dir == 1 ) { @@ -268,7 +236,7 @@ void ghost_sendrecv_PRECISION( vector_PRECISION phi, const int mu, const int dir buffer += offset; } - buffer = (vector_PRECISION)c->buffer[mu_dir]; + buffer = c->buffer[mu_dir]; phi_pt = phi + comm_start; if ( length[0] > 0 ) { @@ -289,13 +257,13 @@ void ghost_sendrecv_PRECISION( vector_PRECISION phi, const int mu, const int dir } -void ghost_wait_PRECISION( vector_PRECISION phi, const int mu, const int dir, +void ghost_wait_PRECISION( buffer_PRECISION phi, const int mu, const int dir, comm_PRECISION_struct *c, const int amount, level_struct *l ) { if( l->global_splitting[mu] > 1 ) { int mu_dir = 2*mu-MIN(dir,0); int i, j, *table, offset = c->offset, length[2]={0,0}, table_start = 0; - vector_PRECISION buffer, phi_pt; + buffer_PRECISION buffer, phi_pt; #ifdef HAVE_TM1p1 if ( g.n_flavours == 2 ) @@ -322,7 +290,7 @@ void ghost_wait_PRECISION( vector_PRECISION phi, const int mu, const int dir, int num_boundary_sites = length[0]/offset; - buffer = (vector_PRECISION)c->buffer[mu_dir]; + buffer = c->buffer[mu_dir]; table = c->boundary_table[2*mu+1] + table_start; if ( length[0] > 0 ) { @@ -375,17 +343,17 @@ void ghost_wait_PRECISION( vector_PRECISION phi, const int mu, const int dir, } -void ghost_update_PRECISION( vector_PRECISION phi, const int mu, const int dir, comm_PRECISION_struct *c, level_struct *l ) { +void ghost_update_PRECISION( vector_PRECISION *phi, const int mu, const int dir, comm_PRECISION_struct *c, level_struct *l ) { if( l->global_splitting[mu] > 1 ) { int i, j, mu_dir = 2*mu-MIN(dir,0), nu, inv_mu_dir = 2*mu+1+MIN(dir,0), length, *table=NULL, comm_start, num_boundary_sites, site_var; - vector_PRECISION buffer, recv_pt, phi_pt; + buffer_PRECISION buffer, recv_pt, phi_pt; site_var = l->num_lattice_site_var; length = c->num_boundary_sites[mu_dir]*l->num_lattice_site_var; num_boundary_sites = c->num_boundary_sites[mu_dir]; - buffer = (vector_PRECISION)c->buffer[mu_dir]; + buffer = c->buffer[mu_dir]; if ( dir == -1 ) comm_start = l->vector_size; @@ -398,7 +366,7 @@ void ghost_update_PRECISION( vector_PRECISION phi, const int mu, const int dir, ASSERT( c->in_use[mu_dir] == 0 ); c->in_use[mu_dir] = 1; - recv_pt = phi + comm_start; + recv_pt = phi->vector_buffer + comm_start; if ( length > 0 ) { PROF_PRECISION_START( _OP_COMM ); MPI_Irecv( recv_pt, length, MPI_COMPLEX_PRECISION, @@ -408,14 +376,14 @@ void ghost_update_PRECISION( vector_PRECISION phi, const int mu, const int dir, table = c->boundary_table[inv_mu_dir]; for ( j=0; jvector_buffer + table[j]*site_var; for ( i=0; ibuffer[mu_dir]; + buffer = c->buffer[mu_dir]; if ( length > 0 ) { PROF_PRECISION_START( _OP_COMM ); @@ -427,7 +395,7 @@ void ghost_update_PRECISION( vector_PRECISION phi, const int mu, const int dir, } -void ghost_update_wait_PRECISION( vector_PRECISION phi, const int mu, const int dir, comm_PRECISION_struct *c, level_struct *l ) { +void ghost_update_wait_PRECISION( vector_PRECISION *phi, const int mu, const int dir, comm_PRECISION_struct *c, level_struct 
*l ) { if( l->global_splitting[mu] > 1 ) { int mu_dir = 2*mu-MIN(dir,0), length = c->num_boundary_sites[mu_dir]*l->num_lattice_site_var; diff --git a/src/ghost_generic.h b/src/ghost_generic.h index 7b5b019..59a583c 100644 --- a/src/ghost_generic.h +++ b/src/ghost_generic.h @@ -22,22 +22,18 @@ #ifndef GHOST_PRECISION_HEADER #define GHOST_PRECISION_HEADER - void negative_sendrecv_PRECISION( vector_PRECISION phi, const int mu, comm_PRECISION_struct *c, level_struct *l ); - - // as negative_sendrecv_PRECISION, but for count vectors stored in phi in vector-fused data layout - // buffer must be big enough to hold the surface data for count vectors (in one direction) - void negative_sendrecv_PRECISION_vectorized( complex_PRECISION *phi, const int mu, comm_PRECISION_struct *c, level_struct *l, int count, complex_PRECISION *buffer ); + void negative_sendrecv_PRECISION( vector_PRECISION *phi, const int mu, comm_PRECISION_struct *c, level_struct *l ); void negative_wait_PRECISION( const int mu, comm_PRECISION_struct *c, level_struct *l ); void ghost_alloc_PRECISION( int buffer_size, comm_PRECISION_struct *c, level_struct *l ); void ghost_free_PRECISION( comm_PRECISION_struct *c, level_struct *l ); void ghost_sendrecv_init_PRECISION( const int type, comm_PRECISION_struct *c, level_struct *l ); - void ghost_sendrecv_PRECISION( vector_PRECISION phi, const int mu, const int dir, + void ghost_sendrecv_PRECISION( buffer_PRECISION phi, const int mu, const int dir, comm_PRECISION_struct *c, const int amount, level_struct *l ); - void ghost_wait_PRECISION( vector_PRECISION phi, const int mu, const int dir, + void ghost_wait_PRECISION( buffer_PRECISION phi, const int mu, const int dir, comm_PRECISION_struct *c, const int amount, level_struct *l ); - void ghost_update_PRECISION( vector_PRECISION phi, const int mu, const int dir, comm_PRECISION_struct *c, level_struct *l ); - void ghost_update_wait_PRECISION( vector_PRECISION phi, const int mu, const int dir, comm_PRECISION_struct *c, level_struct *l ); + void ghost_update_PRECISION( vector_PRECISION *phi, const int mu, const int dir, comm_PRECISION_struct *c, level_struct *l ); + void ghost_update_wait_PRECISION( vector_PRECISION *phi, const int mu, const int dir, comm_PRECISION_struct *c, level_struct *l ); #endif diff --git a/src/init.c b/src/init.c index cd83ce4..614c515 100644 --- a/src/init.c +++ b/src/init.c @@ -152,22 +152,26 @@ void method_setup( vector_double *V, level_struct *l, struct Thread *threading ) #ifdef INIT_ONE_PREC if ( g.mixed_precision == 2 ) { #endif - fgmres_MP_struct_alloc( g.restart, g.max_restart, l->inner_vector_size, + fgmres_MP_struct_alloc( g.restart, g.max_restart, _INNER, g.tol, _RIGHT, vcycle_float, &(g.p_MP), l ); g.p.op = &(g.op_double); #if defined(INIT_ONE_PREC) && (defined (DEBUG) || defined (TEST_VECTOR_ANALYSIS)) #ifdef HAVE_TM1p1 - MALLOC( g.p.b, complex_double, 2*l->inner_vector_size ); - MALLOC( g.p.x, complex_double, 2*l->inner_vector_size ); + vector_double_alloc( &(g.p.b), _INNER, 2*g.num_rhs_vect, l, no_threading ); + vector_double_alloc( &(g.p.x), _INNER, 2*g.num_rhs_vect, l, no_threading ); + //MALLOC( g.p.b, complex_double, 2*l->inner_vector_size ); + //MALLOC( g.p.x, complex_double, 2*l->inner_vector_size ); #else - MALLOC( g.p.b, complex_double, l->inner_vector_size ); - MALLOC( g.p.x, complex_double, l->inner_vector_size ); + vector_double_alloc( &(g.p.b), _INNER, g.num_rhs_vect, l, no_threading ); + vector_double_alloc( &(g.p.x), _INNER, g.num_rhs_vect, l, no_threading ); + //MALLOC( g.p.b, complex_double, 
l->inner_vector_size ); + //MALLOC( g.p.x, complex_double, l->inner_vector_size ); #endif #endif #ifdef INIT_ONE_PREC } else { #endif - fgmres_double_struct_alloc( g.restart, g.max_restart, l->inner_vector_size, g.tol, + fgmres_double_struct_alloc( g.restart, g.max_restart, _INNER, g.tol, _GLOBAL_FGMRES, _RIGHT, preconditioner, g.method==6?g5D_plus_clover_double:d_plus_clover_double, &(g.p), l ); } @@ -178,29 +182,36 @@ void method_setup( vector_double *V, level_struct *l, struct Thread *threading ) #ifdef INIT_ONE_PREC if ( g.mixed_precision == 2 ) { #endif - fgmres_MP_struct_alloc( g.restart, g.max_restart, l->inner_vector_size, + fgmres_MP_struct_alloc( g.restart, g.max_restart, _INNER, g.tol, _NOTHING, NULL, &(g.p_MP), l ); g.p.op = &(g.op_double); #if defined(INIT_ONE_PREC) && (defined (DEBUG) || defined (TEST_VECTOR_ANALYSIS)) #ifdef HAVE_TM1p1 - MALLOC( g.p.b, complex_double, 2*l->inner_vector_size ); - MALLOC( g.p.x, complex_double, 2*l->inner_vector_size ); + vector_double_alloc( &(g.p.b), _INNER, 2*g.num_rhs_vect, l, no_threading ); + vector_double_alloc( &(g.p.x), _INNER, 2*g.num_rhs_vect, l, no_threading ); + //MALLOC( g.p.b, complex_double, 2*l->inner_vector_size ); + //MALLOC( g.p.x, complex_double, 2*l->inner_vector_size ); #else - MALLOC( g.p.b, complex_double, l->inner_vector_size ); - MALLOC( g.p.x, complex_double, l->inner_vector_size ); + vector_double_alloc( &(g.p.b), _INNER, g.num_rhs_vect, l, no_threading ); + vector_double_alloc( &(g.p.x), _INNER, g.num_rhs_vect, l, no_threading ); + //MALLOC( g.p.b, complex_double, l->inner_vector_size ); + //MALLOC( g.p.x, complex_double, l->inner_vector_size ); #endif #endif #ifdef INIT_ONE_PREC } else { #endif - fgmres_double_struct_alloc( g.restart, g.max_restart, l->inner_vector_size, g.tol, + /*fgmres_double_struct_alloc( g.restart, g.max_restart, _INNER, g.tol, _GLOBAL_FGMRES, _NOTHING, NULL, d_plus_clover_double, + &(g.p), l );*/ + fgmres_double_struct_alloc( g.restart, g.max_restart, _INNER, g.tol, + _GLOBAL_FGMRES, _NOTHING, NULL, d_plus_clover_double_new, &(g.p), l ); #ifdef INIT_ONE_PREC } #endif } else if ( g.method == -1 ) { - fgmres_double_struct_alloc( 4, g.restart*g.max_restart, l->inner_vector_size, g.tol, + fgmres_double_struct_alloc( 4, g.restart*g.max_restart, _INNER, g.tol, _GLOBAL_FGMRES, _NOTHING, NULL, d_plus_clover_double, &(g.p), l ); fine_level_double_alloc( l ); } @@ -361,14 +372,18 @@ void method_free( level_struct *l ) { #ifdef INIT_ONE_PREC if ( g.mixed_precision == 2 && g.method >= 0 ) { #endif - fgmres_MP_struct_free( &(g.p_MP) ); + fgmres_MP_struct_free( &(g.p_MP), l ); #if defined (INIT_ONE_PREC) && (defined (DEBUG) || defined (TEST_VECTOR_ANALYSIS)) #ifdef HAVE_TM1p1 - FREE( g.p.b, complex_double, 2*l->inner_vector_size ); - FREE( g.p.x, complex_double, 2*l->inner_vector_size ); + vector_double_free( &(g.p.b), l, no_threading ); + vector_double_free( &(g.p.x), l, no_threading ); + //FREE( g.p.b, complex_double, 2*l->inner_vector_size ); + //FREE( g.p.x, complex_double, 2*l->inner_vector_size ); #else - FREE( g.p.b, complex_double, l->inner_vector_size ); - FREE( g.p.x, complex_double, l->inner_vector_size ); + vector_double_free( &(g.p.b), l, no_threading ); + vector_double_free( &(g.p.x), l, no_threading ); + //FREE( g.p.b, complex_double, l->inner_vector_size ); + //FREE( g.p.x, complex_double, l->inner_vector_size ); #endif #endif #ifdef INIT_ONE_PREC @@ -646,8 +661,8 @@ void l_init( level_struct *l ) { level_double_init( l ); level_float_init( l ); - - l->x = NULL; + + 
vector_double_init(&(l->x)); l->next_level = NULL; l->reqs = NULL; } @@ -679,6 +694,7 @@ void g_init( level_struct *l ) { g.cur_storage = 0; g.max_storage = 0; g.in_setup = 0; + g.num_rhs_vect = 0; } void read_global_info( FILE *in ) { @@ -1021,6 +1037,8 @@ void read_solver_parameters( FILE *in, level_struct *l ) { save_pt = &(g.downprop); g.downprop=1; read_parameter( &save_pt, "addDownPropagator:", "%d", 1, in, _DEFAULT_SET ); #endif + save_pt = &(g.num_rhs_vect); g.num_rhs_vect=1; + read_parameter( &save_pt, "number of rhs vectors:", "%d", 1, in, _DEFAULT_SET ); if ( g.randomize ) { srand( time( 0 ) + 1000*g.my_rank ); @@ -1085,13 +1103,6 @@ void validate_parameters( int ls, level_struct *l ) { int i; int mu; -#ifdef SSE - if ( !g.odd_even ) - warning0("The SSE implementation is based on the odd-even preconditioned code.\ - \n Switch on odd-even preconditioning in the input file.\n"); - ASSERT( g.odd_even ); -#endif - if ( g.method == 5 && g.interpolation != 0 ) { warning0("Multigrid with BiCGstab smoothing is not supported.\n Switching to FGMRES preconditioned with BiCGstab (g.interpolation=0).\n"); g.interpolation = 0; @@ -1115,14 +1126,6 @@ void validate_parameters( int ls, level_struct *l ) { ASSERT( DIVIDES( g.block_lattice[i][mu], g.local_lattice[i][mu] ) ); ASSERT( DIVIDES( g.global_lattice[i][mu]/g.global_lattice[i+1][mu], g.local_lattice[i][mu] ) ); ASSERT( DIVIDES( g.block_lattice[i][mu], g.global_lattice[i][mu]/g.global_lattice[i+1][mu] ) ); -#ifdef SSE - if ( g.block_lattice[i][mu] != g.global_lattice[i][mu]/g.global_lattice[i+1][mu] ) - warning0("when using SSE, Schwarz block size and aggregate size have to match.\n"); - ASSERT( g.block_lattice[i][mu] == g.global_lattice[i][mu]/g.global_lattice[i+1][mu] ); - // it works everywhere but we have some problem with the vector size. - // TODO: check all vectora allocated with size l->inner_vector_size - ASSERT( g.num_eig_vect[i] % SIMD_LENGTH_float == 0 ); -#endif } if ( g.odd_even ) { @@ -1161,10 +1164,6 @@ void validate_parameters( int ls, level_struct *l ) { //LIST OF CASES WHICH SHOULD WORK, BUT DO NOT (TODO) -#ifdef SSE - ASSERT( g.mixed_precision ); -#endif - //TODO: Could work without, but you need to fix the setup phase. 
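Note (not part of the patch): the new "number of rhs vectors:" input parameter read above sets g.num_rhs_vect, and the *_new kernels earlier in this diff interleave the right-hand sides so that spin-color component c of rhs r is addressed as c*n_vect + r. The following is a minimal plain-C sketch of that indexing convention only; the function and variable names are hypothetical stand-ins, and in the real kernels the VECTOR_LOOP macro plays the role of the inner loop over r. It mirrors just the clover-diagonal part of site_clover_PRECISION_new.

  #include <complex.h>

  /* Illustration of the interleaved multi-rhs layout used by the *_new kernels:
   * component c (0..11) of right-hand side r lives at index c*n_vect + r. */
  static void site_clover_diag_sketch( float complex *eta, const float complex *phi,
                                       const float complex *clover, int n_vect ) {
    for ( int c = 0; c < 12; c++ )        /* 12 spin-color components per site */
      for ( int r = 0; r < n_vect; r++ )  /* all interleaved right-hand sides  */
        eta[c*n_vect + r] = clover[c] * phi[c*n_vect + r];  /* diagonal term only */
  }

With this ordering all rhs copies of a given component are contiguous, so the innermost loop runs over contiguous memory.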
for ( i=0; iprof_PRECISION.name[_GRAM_SCHMIDT], "Gram-Schmidt, PRECISION" ); sprintf( l->prof_PRECISION.name[_GRAM_SCHMIDT_ON_AGGREGATES], "Gram-Schmidt on aggregates, PRECISION" ); sprintf( l->prof_PRECISION.name[_CPY], "copy operations, PRECISION" ); + sprintf( l->prof_PRECISION.name[_RS], "real scale operations, PRECISION" ); sprintf( l->prof_PRECISION.name[_SET], "set value operations, PRECISION" ); sprintf( l->prof_PRECISION.name[_PR], "interpolation and restriction, PRECISION" ); l->prof_PRECISION.flop[_PR] = level_ratio*l->num_lattice_site_var*8.0*(l->num_lattice_site_var/2); @@ -94,22 +95,21 @@ double prof_PRECISION_print( level_struct *l ) { return flop; } - void fine_level_PRECISION_alloc( level_struct *l ) { int n = 8; #ifdef HAVE_TM1p1 - MALLOC( l->vbuf_PRECISION[0], complex_PRECISION, 2*n*l->vector_size ); - for ( int i=1; ivbuf_PRECISION[i] = l->vbuf_PRECISION[0] + 2*i*l->vector_size; - MALLOC( l->p_PRECISION.b, complex_PRECISION, 2*2*l->inner_vector_size ); - l->p_PRECISION.x = l->p_PRECISION.b + 2*l->inner_vector_size; + for ( int i=0; ivbuf_PRECISION[i]), _ORDINARY, 2*g.num_rhs_vect, l, no_threading ); + } + vector_PRECISION_alloc( &(l->p_PRECISION.b), _INNER, 2*g.num_rhs_vect, l, no_threading ); + vector_PRECISION_alloc( &(l->p_PRECISION.x), _INNER, 2*g.num_rhs_vect, l, no_threading ); #else - MALLOC( l->vbuf_PRECISION[0], complex_PRECISION, n*l->vector_size ); - for ( int i=1; ivbuf_PRECISION[i] = l->vbuf_PRECISION[0] + i*l->vector_size; - MALLOC( l->p_PRECISION.b, complex_PRECISION, 2*l->inner_vector_size ); - l->p_PRECISION.x = l->p_PRECISION.b + l->inner_vector_size; + for ( int i=0; ivbuf_PRECISION[i]), _ORDINARY, g.num_rhs_vect, l, no_threading ); + } + vector_PRECISION_alloc( &(l->p_PRECISION.b), _INNER, g.num_rhs_vect, l, no_threading ); + vector_PRECISION_alloc( &(l->p_PRECISION.x), _INNER, g.num_rhs_vect, l, no_threading ); #endif } @@ -117,20 +117,10 @@ void fine_level_PRECISION_alloc( level_struct *l ) { void fine_level_PRECISION_free( level_struct *l ) { int n = 8; - -#ifdef HAVE_TM1p1 - FREE( l->vbuf_PRECISION[0], complex_PRECISION, 2*n*l->vector_size ); - for ( int i=1; ivbuf_PRECISION[i] = NULL; - FREE( l->p_PRECISION.b, complex_PRECISION, 2*2*l->inner_vector_size ); - l->p_PRECISION.x = NULL; -#else - FREE( l->vbuf_PRECISION[0], complex_PRECISION, n*l->vector_size ); - for ( int i=1; ivbuf_PRECISION[i] = NULL; - FREE( l->p_PRECISION.b, complex_PRECISION, 2*l->inner_vector_size ); - l->p_PRECISION.x = NULL; -#endif + for ( int i=0; ivbuf_PRECISION[i]), l, no_threading ); + vector_PRECISION_free( &(l->p_PRECISION.b), l, no_threading ); + vector_PRECISION_free( &(l->p_PRECISION.x), l, no_threading ); } @@ -146,24 +136,26 @@ void next_level_PRECISION_setup( level_struct *l ) { coarsening_index_table_PRECISION_define( &(l->is_PRECISION), &(l->s_PRECISION), l ); if ( l->level == 1 && !l->next_level->idle ) { - fgmres_PRECISION_struct_alloc( g.coarse_iter, g.coarse_restart, l->next_level->vector_size, g.coarse_tol, + fgmres_PRECISION_struct_alloc( g.coarse_iter, g.coarse_restart, _ORDINARY, g.coarse_tol, _COARSE_GMRES, _NOTHING, NULL, g.method==6?(g.odd_even?g5D_coarse_apply_schur_complement_PRECISION:g5D_apply_coarse_operator_PRECISION) :(g.odd_even?coarse_apply_schur_complement_PRECISION:apply_coarse_operator_PRECISION), &(l->next_level->p_PRECISION), l->next_level ); } else { if ( g.kcycle ) { - fgmres_PRECISION_struct_alloc( g.kcycle_restart, g.kcycle_max_restart, l->next_level->vector_size, g.kcycle_tol, + fgmres_PRECISION_struct_alloc( g.kcycle_restart, 
g.kcycle_max_restart, _ORDINARY, g.kcycle_tol, _K_CYCLE, _RIGHT, vcycle_PRECISION, g.method==6?g5D_apply_coarse_operator_PRECISION:apply_coarse_operator_PRECISION, &(l->next_level->p_PRECISION), l->next_level ); } else { + vector_PRECISION_init(&(l->next_level->p_PRECISION.b)); + vector_PRECISION_init(&(l->next_level->p_PRECISION.x)); #ifdef HAVE_TM1p1 - MALLOC( l->next_level->p_PRECISION.b, complex_PRECISION, 2*2*l->next_level->vector_size ); - l->next_level->p_PRECISION.x = l->next_level->p_PRECISION.b + 2*l->next_level->vector_size; + vector_PRECISION_alloc( &(l->next_level->p_PRECISION.b), _ORDINARY, 2*g.num_rhs_vect, l->next_level, no_threading ); + vector_PRECISION_alloc( &(l->next_level->p_PRECISION.x), _ORDINARY, 2*g.num_rhs_vect, l->next_level, no_threading ); #else - MALLOC( l->next_level->p_PRECISION.b, complex_PRECISION, 2*l->next_level->vector_size ); - l->next_level->p_PRECISION.x = l->next_level->p_PRECISION.b + l->next_level->vector_size; + vector_PRECISION_alloc( &(l->next_level->p_PRECISION.b), _ORDINARY, g.num_rhs_vect, l->next_level, no_threading ); + vector_PRECISION_alloc( &(l->next_level->p_PRECISION.x), _ORDINARY, g.num_rhs_vect, l->next_level, no_threading ); #endif l->next_level->p_PRECISION.v_start = 0; l->next_level->p_PRECISION.v_end = l->next_level->inner_vector_size; @@ -171,15 +163,13 @@ void next_level_PRECISION_setup( level_struct *l ) { } int i, n = (l->next_level->level>0)?6:4; + for ( i=0; inext_level->vbuf_PRECISION[0], complex_PRECISION, 2*n*l->next_level->vector_size ); - for ( i=1; inext_level->vbuf_PRECISION[i] = l->next_level->vbuf_PRECISION[0] + 2*i*l->next_level->vector_size; + vector_PRECISION_alloc( &(l->next_level->vbuf_PRECISION[i]), _ORDINARY, 2*g.num_rhs_vect, l->next_level, no_threading ); #else - MALLOC( l->next_level->vbuf_PRECISION[0], complex_PRECISION, n*l->next_level->vector_size ); - for ( i=1; inext_level->vbuf_PRECISION[i] = l->next_level->vbuf_PRECISION[0] + i*l->next_level->vector_size; + vector_PRECISION_alloc( &(l->next_level->vbuf_PRECISION[i]), _ORDINARY, g.num_rhs_vect, l->next_level, no_threading ); #endif + } } } @@ -192,21 +182,13 @@ void next_level_PRECISION_free( level_struct *l ) { if ( ( l->level == 1 && !l->next_level->idle ) || g.kcycle ) { fgmres_PRECISION_struct_free( &(l->next_level->p_PRECISION), l->next_level ); } else { -#ifdef HAVE_TM1p1 - FREE( l->next_level->p_PRECISION.b, complex_PRECISION, 2*2*l->next_level->vector_size ); -#else - FREE( l->next_level->p_PRECISION.b, complex_PRECISION, 2*l->next_level->vector_size ); -#endif + vector_PRECISION_free( &(l->next_level->p_PRECISION.b), l->next_level, no_threading ); + vector_PRECISION_free( &(l->next_level->p_PRECISION.x), l->next_level, no_threading ); } int i, n = (l->next_level->level>0)?6:4; - for ( i=1; inext_level->vbuf_PRECISION[i] = NULL; -#ifdef HAVE_TM1p1 - FREE( l->next_level->vbuf_PRECISION[0], complex_PRECISION, 2*n*l->next_level->vector_size ); -#else - FREE( l->next_level->vbuf_PRECISION[0], complex_PRECISION, n*l->next_level->vector_size ); -#endif + for ( i=0; inext_level->vbuf_PRECISION[i]), l->next_level, no_threading ); coarsening_index_table_PRECISION_free( &(l->is_PRECISION), l ); } @@ -217,7 +199,7 @@ void next_level_PRECISION_free( level_struct *l ) { void level_PRECISION_init( level_struct *l ) { for ( int i=0; i<9; i++ ) - l->vbuf_PRECISION[i] = NULL; + vector_PRECISION_init( &(l->vbuf_PRECISION[i]) ); operator_PRECISION_init( &(l->op_PRECISION) ); operator_PRECISION_init( &(l->oe_op_PRECISION) ); @@ -231,20 +213,23 @@ void 
level_PRECISION_init( level_struct *l ) { void vcycle_timing_PRECISION( int n, level_struct *l, struct Thread *threading ) { ASSERT( g.mixed_precision ); - vector_PRECISION v1 = NULL, v2 = NULL; + vector_PRECISION v1, v2; + vector_PRECISION_init(&v1); + vector_PRECISION_init(&v2); + double t0=0, t1=0; - PUBLIC_MALLOC( v1, complex_PRECISION, l->inner_vector_size ); - PUBLIC_MALLOC( v2, complex_PRECISION, l->inner_vector_size ); + vector_PRECISION_alloc(&v1, _INNER, 1, l, threading); + vector_PRECISION_alloc(&v2, _INNER, 1, l, threading); START_LOCKED_MASTER(threading) - vector_PRECISION_define_random( v2, 0, l->inner_vector_size, l ); + vector_PRECISION_define_random( &v2, 0, l->inner_vector_size, l ); END_LOCKED_MASTER(threading) START_MASTER(threading) t0 = MPI_Wtime(); END_MASTER(threading) for ( int i=0; iinner_vector_size ); - PUBLIC_FREE( v2, complex_PRECISION, l->inner_vector_size ); + vector_PRECISION_free(&v1, l, threading); + vector_PRECISION_free(&v2, l, threading); END_LOCKED_MASTER(threading) } diff --git a/src/interpolation_generic.c b/src/interpolation_generic.c index 8981bec..b879824 100644 --- a/src/interpolation_generic.c +++ b/src/interpolation_generic.c @@ -21,39 +21,34 @@ #include "main.h" -#if ( !defined( SSE ) || !defined( INTERPOLATION_OPERATOR_LAYOUT_OPTIMIZED_PRECISION ) ) - void interpolation_PRECISION_alloc( level_struct *l ) { int k, n = l->num_eig_vect; MALLOC( l->is_PRECISION.eigenvalues, complex_PRECISION, n ); - MALLOC( l->is_PRECISION.test_vector, complex_PRECISION*, n ); - MALLOC( l->is_PRECISION.interpolation, complex_PRECISION*, n ); - l->is_PRECISION.interpolation[0] = NULL; - MALLOC_HUGEPAGES( l->is_PRECISION.interpolation[0], complex_PRECISION, n*l->vector_size, 64 ); - for ( k=1; kis_PRECISION.interpolation[k] = l->is_PRECISION.interpolation[0] + k*l->vector_size; - MALLOC( l->is_PRECISION.operator, complex_PRECISION, n*l->inner_vector_size ); - l->is_PRECISION.test_vector[0] = NULL; - MALLOC_HUGEPAGES( l->is_PRECISION.test_vector[0], complex_PRECISION, n*l->inner_vector_size, 64 ); - for ( k=1; kis_PRECISION.test_vector[k] = l->is_PRECISION.test_vector[0] + k*l->inner_vector_size; + MALLOC( l->is_PRECISION.test_vector, vector_PRECISION, n ); + MALLOC( l->is_PRECISION.interpolation, vector_PRECISION, n ); + for ( k=0; kis_PRECISION.interpolation[k])); + vector_PRECISION_alloc(&(l->is_PRECISION.interpolation[k]), _ORDINARY, 1, l, no_threading ); + vector_PRECISION_init(&(l->is_PRECISION.test_vector[k])); + vector_PRECISION_alloc(&(l->is_PRECISION.test_vector[k]), _INNER, 1, l, no_threading ); } + MALLOC( l->is_PRECISION.operator, complex_PRECISION, n*l->inner_vector_size ); } void interpolation_PRECISION_dummy_alloc( level_struct *l ) { - MALLOC( l->is_PRECISION.test_vector, complex_PRECISION*, l->num_eig_vect ); - MALLOC( l->is_PRECISION.interpolation, complex_PRECISION*, l->num_eig_vect ); + MALLOC( l->is_PRECISION.test_vector, vector_PRECISION, l->num_eig_vect ); + MALLOC( l->is_PRECISION.interpolation, vector_PRECISION, l->num_eig_vect ); } void interpolation_PRECISION_dummy_free( level_struct *l ) { - FREE( l->is_PRECISION.test_vector, complex_PRECISION*, l->num_eig_vect ); - FREE( l->is_PRECISION.interpolation, complex_PRECISION*, l->num_eig_vect ); + FREE( l->is_PRECISION.test_vector, vector_PRECISION, l->num_eig_vect ); + FREE( l->is_PRECISION.interpolation, vector_PRECISION, l->num_eig_vect ); } @@ -61,17 +56,19 @@ void interpolation_PRECISION_free( level_struct *l ) { int n = l->num_eig_vect; - FREE_HUGEPAGES( l->is_PRECISION.test_vector[0], 
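The fine- and coarse-level allocations above switch from MALLOC/FREE (and PUBLIC_MALLOC) on raw complex_PRECISION buffers to vector_PRECISION_init / vector_PRECISION_alloc / vector_PRECISION_free, taking a size class (_INNER or _ORDINARY) and a vector count (g.num_rhs_vect, doubled under HAVE_TM1p1) instead of an explicit length. Below is a minimal stand-alone sketch of that lifecycle, modelled on the rewritten vcycle_timing_PRECISION; the real calls also take the level_struct and a threading argument, and the two lengths come from l->inner_vector_size and l->vector_size rather than being passed explicitly as they are here.

#include <complex.h>
#include <stdlib.h>
#include <string.h>

/* Stand-ins for the patch's vector API; the names follow the diff, the
 * bodies are simplified guesses (no threading, no hugepages, no level). */
typedef enum { _INNER = 0, _ORDINARY = 1 } vl_type_t;

typedef struct {
  float complex *vector_buffer;
  int num_vect;
  int size;                      /* entries per right-hand side */
} vector_float_sketch;

static void vector_float_init_sketch( vector_float_sketch *v ) {
  memset( v, 0, sizeof(*v) );
}

static void vector_float_alloc_sketch( vector_float_sketch *v, vl_type_t type,
                                       int num_vect, int inner_size, int full_size ) {
  v->size = ( type == _INNER ) ? inner_size : full_size;
  v->num_vect = num_vect;
  v->vector_buffer = calloc( (size_t)v->size*num_vect, sizeof(float complex) );
}

static void vector_float_free_sketch( vector_float_sketch *v ) {
  free( v->vector_buffer );
  vector_float_init_sketch( v );
}

/* Same shape as the rewritten vcycle_timing_PRECISION: init, alloc with a
 * size class and a vector count, fill, use, free. */
int main( void ) {
  vector_float_sketch v1, v2;
  vector_float_init_sketch( &v1 );
  vector_float_init_sketch( &v2 );
  vector_float_alloc_sketch( &v1, _INNER, 1, 24, 32 );
  vector_float_alloc_sketch( &v2, _INNER, 1, 24, 32 );
  for ( int i = 0; i < v2.size; i++ )
    v2.vector_buffer[i] = (float)rand() / (float)RAND_MAX;  /* stand-in for vector_PRECISION_define_random */
  /* ... apply the V-cycle to v2 and time it here ... */
  vector_float_free_sketch( &v1 );
  vector_float_free_sketch( &v2 );
  return 0;
}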
complex_PRECISION, n*l->inner_vector_size ); + for (int k=0; kis_PRECISION.interpolation[k]), l, no_threading ); + vector_PRECISION_free(&(l->is_PRECISION.test_vector[k]), l, no_threading ); + } FREE( l->is_PRECISION.eigenvalues, complex_PRECISION, n ); - FREE( l->is_PRECISION.test_vector, complex_PRECISION*, n ); - FREE_HUGEPAGES( l->is_PRECISION.interpolation[0], complex_PRECISION, n*l->vector_size ); - FREE( l->is_PRECISION.interpolation, complex_PRECISION*, n ); + FREE( l->is_PRECISION.test_vector, vector_PRECISION, n ); + FREE( l->is_PRECISION.interpolation, vector_PRECISION, n ); FREE( l->is_PRECISION.operator, complex_PRECISION, n*l->inner_vector_size ); } -void define_interpolation_PRECISION_operator( complex_PRECISION **interpolation, level_struct *l, struct Thread *threading ) { +void define_interpolation_PRECISION_operator( vector_PRECISION *interpolation, level_struct *l, struct Thread *threading ) { int j, num_eig_vect = l->num_eig_vect; complex_PRECISION *operator = l->is_PRECISION.operator; @@ -83,31 +80,31 @@ void define_interpolation_PRECISION_operator( complex_PRECISION **interpolation, operator += start*num_eig_vect; for ( int i=start; iis_PRECISION.num_agg, num_eig_vect = l->num_eig_vect, sign = 1, num_parent_eig_vect = l->num_parent_eig_vect, aggregate_sites = l->num_inner_lattice_sites / num_aggregates; - complex_PRECISION *operator = l->is_PRECISION.operator, *phi_pt = phi, - *phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer; + complex_PRECISION *operator = l->is_PRECISION.operator, *phi_pt = phi->vector_buffer, + *phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer.vector_buffer; START_LOCKED_MASTER(threading) - vector_PRECISION_distribute( phi_c_pt, phi_c, l->next_level ); + vector_PRECISION_distribute( &(l->next_level->gs_PRECISION.transfer_buffer), phi_c, l->next_level ); END_LOCKED_MASTER(threading) SYNC_HYPERTHREADS(threading) #ifdef HAVE_TM1p1 if( g.n_flavours==2 ) for ( i=threading->n_thread*threading->core + threading->thread; in_core*threading->n_thread ) { - phi_pt = phi + i*2*2*num_parent_eig_vect*aggregate_sites; - phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer + i*2*2*num_eig_vect; + phi_pt = phi->vector_buffer + i*2*2*num_parent_eig_vect*aggregate_sites; + phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer.vector_buffer + i*2*2*num_eig_vect; operator = l->is_PRECISION.operator + i*2*num_eig_vect*num_parent_eig_vect*aggregate_sites; for ( k=0; kn_thread*threading->core + threading->thread; in_core*threading->n_thread ) { - phi_pt = phi + i*2*num_parent_eig_vect*aggregate_sites; - phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer + i*2*num_eig_vect; + phi_pt = phi->vector_buffer + i*2*num_parent_eig_vect*aggregate_sites; + phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer.vector_buffer + i*2*num_eig_vect; operator = l->is_PRECISION.operator + i*2*num_eig_vect*num_parent_eig_vect*aggregate_sites; for ( k=0; kis_PRECISION.num_agg, num_eig_vect = l->num_eig_vect, num_parent_eig_vect = l->num_parent_eig_vect, aggregate_sites = l->num_inner_lattice_sites / num_aggregates; - complex_PRECISION *operator = l->is_PRECISION.operator, *phi_pt = phi, - *phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer; + complex_PRECISION *operator = l->is_PRECISION.operator, *phi_pt = phi->vector_buffer, + *phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer.vector_buffer; START_LOCKED_MASTER(threading) - vector_PRECISION_distribute( phi_c_pt, phi_c, l->next_level ); + vector_PRECISION_distribute( 
&(l->next_level->gs_PRECISION.transfer_buffer), phi_c, l->next_level ); END_LOCKED_MASTER(threading) SYNC_HYPERTHREADS(threading) #ifdef HAVE_TM1p1 if( g.n_flavours==2 ) for ( i=threading->n_thread*threading->core + threading->thread; in_core*threading->n_thread ) { - phi_pt = phi + i*2*2*num_parent_eig_vect*aggregate_sites; - phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer + i*2*2*num_eig_vect; + phi_pt = phi->vector_buffer + i*2*2*num_parent_eig_vect*aggregate_sites; + phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer.vector_buffer + i*2*2*num_eig_vect; int sign = 1; operator = l->is_PRECISION.operator + i*2*num_eig_vect*num_parent_eig_vect*aggregate_sites; for ( k=0; kn_thread*threading->core + threading->thread; in_core*threading->n_thread ) { - phi_pt = phi + i*2*num_parent_eig_vect*aggregate_sites; - phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer + i*2*num_eig_vect; + phi_pt = phi->vector_buffer + i*2*num_parent_eig_vect*aggregate_sites; + phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer.vector_buffer + i*2*num_eig_vect; int sign = 1; operator = l->is_PRECISION.operator + i*2*num_eig_vect*num_parent_eig_vect*aggregate_sites; for ( k=0; kis_PRECISION.num_agg, num_eig_vect = l->num_eig_vect, sign = 1, num_parent_eig_vect = l->num_parent_eig_vect, aggregate_sites = l->num_inner_lattice_sites / num_aggregates; - complex_PRECISION *operator = l->is_PRECISION.operator, *phi_pt = phi, - *phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer; + complex_PRECISION *operator = l->is_PRECISION.operator, *phi_pt = phi->vector_buffer, + *phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer.vector_buffer; #ifdef HAVE_TM1p1 if( g.n_flavours==2 ) for ( i=threading->n_thread*threading->core + threading->thread; in_core*threading->n_thread ) { - phi_pt = phi + i*2*2*num_parent_eig_vect*aggregate_sites; - phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer + i*2*2*num_eig_vect; + phi_pt = phi->vector_buffer + i*2*2*num_parent_eig_vect*aggregate_sites; + phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer.vector_buffer + i*2*2*num_eig_vect; operator = l->is_PRECISION.operator + i*2*num_eig_vect*num_parent_eig_vect*aggregate_sites; for ( j=0; j<2*2*num_eig_vect; j++ ) @@ -280,8 +277,8 @@ void restrict_PRECISION( vector_PRECISION phi_c, vector_PRECISION phi, level_str else #endif for ( i=threading->n_thread*threading->core + threading->thread; in_core*threading->n_thread ) { - phi_pt = phi + i*2*num_parent_eig_vect*aggregate_sites; - phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer + i*2*num_eig_vect; + phi_pt = phi->vector_buffer + i*2*num_parent_eig_vect*aggregate_sites; + phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer.vector_buffer + i*2*num_eig_vect; operator = l->is_PRECISION.operator + i*2*num_eig_vect*num_parent_eig_vect*aggregate_sites; for ( j=0; j<2*num_eig_vect; j++ ) @@ -303,9 +300,8 @@ void restrict_PRECISION( vector_PRECISION phi_c, vector_PRECISION phi, level_str SYNC_HYPERTHREADS(threading) START_LOCKED_MASTER(threading) - vector_PRECISION_gather( phi_c, l->next_level->gs_PRECISION.transfer_buffer, l->next_level ); + vector_PRECISION_gather( phi_c, &(l->next_level->gs_PRECISION.transfer_buffer), l->next_level ); END_LOCKED_MASTER(threading) PROF_PRECISION_STOP( _PR, 1, threading ); } -#endif diff --git a/src/interpolation_generic.h b/src/interpolation_generic.h index 97be6ec..43c65d2 100644 --- a/src/interpolation_generic.h +++ b/src/interpolation_generic.h @@ -29,10 +29,10 @@ void interpolation_PRECISION_dummy_alloc( level_struct *l ); void 
interpolation_PRECISION_dummy_free( level_struct *l ); - void interpolate_PRECISION( vector_PRECISION phi, vector_PRECISION phi_c, level_struct *l, struct Thread *threading ); - void interpolate3_PRECISION( vector_PRECISION phi, vector_PRECISION phi_c, level_struct *l, struct Thread *threading ); - void restrict_PRECISION( vector_PRECISION phi_c, vector_PRECISION phi, level_struct *l, struct Thread *threading ); + void interpolate_PRECISION( vector_PRECISION *phi, vector_PRECISION *phi_c, level_struct *l, struct Thread *threading ); + void interpolate3_PRECISION( vector_PRECISION *phi, vector_PRECISION *phi_c, level_struct *l, struct Thread *threading ); + void restrict_PRECISION( vector_PRECISION *phi_c, vector_PRECISION *phi, level_struct *l, struct Thread *threading ); - void define_interpolation_PRECISION_operator( complex_PRECISION **interpolation, level_struct *l, struct Thread *threading ); + void define_interpolation_PRECISION_operator( vector_PRECISION *interpolation, level_struct *l, struct Thread *threading ); #endif diff --git a/src/io.c b/src/io.c index 02b5ceb..fe9272a 100644 --- a/src/io.c +++ b/src/io.c @@ -716,8 +716,9 @@ void vector_io( double *phi, char *filename, const int mode, level_struct *l ) { FREE( buffer[0].data, double, bar_size ); FREE( buffer[1].data, double, bar_size ); } - - norm = global_norm_double( (vector_double)phi, 0, l->inner_vector_size, l, no_threading ); + vector_double phi_vec; + phi_vec.vector_buffer = (buffer_double) phi; + norm = global_norm_double( &phi_vec, 0, l->inner_vector_size, l, no_threading ); printf0("norm: %e\n", norm ); printf0("...done (%lf seconds)\n\n", t1-t0 ); } @@ -871,7 +872,7 @@ void vector_io_single_file( double *psi, double *lambda, char *filename, const i ASSERT( fread( buffer_pt->data, sizeof(double), bar_size, file ) ); } - phi=(double *) (l->x); + phi=(double *) (&(l->x)); phi_pt=phi; for ( t=0; tis_float.test_vector[j], l->x, l->s_float.op.translation_table, l, no_threading); + trans_float(&(l->is_float.test_vector[j]), &(l->x), l->s_float.op.translation_table, l, no_threading); else - trans_double(l->is_double.test_vector[j], l->x, l->s_double.op.translation_table, l, no_threading); + trans_double(&(l->is_double.test_vector[j]), &(l->x), l->s_double.op.translation_table, l, no_threading); } else { - vector_double_copy( ((vector_double)psi)+j*l->inner_vector_size, l->x, 0, l->inner_vector_size, l ); + vector_double psi_vec; + psi_vec.vector_buffer = ((buffer_double) psi) + j*l->inner_vector_size; + vector_double_copy( &psi_vec, &(l->x), 0, l->inner_vector_size, l ); } } } else if ( mode == _WRITE ) { @@ -927,13 +930,15 @@ void vector_io_single_file( double *psi, double *lambda, char *filename, const i for ( j=0; jx, l->is_float.test_vector[j], l->s_float.op.translation_table, l, no_threading ); + trans_back_float( &(l->x), &(l->is_float.test_vector[j]), l->s_float.op.translation_table, l, no_threading ); else - trans_back_double( l->x, l->is_double.test_vector[j], l->s_double.op.translation_table, l, no_threading ); + trans_back_double( &(l->x), &(l->is_double.test_vector[j]), l->s_double.op.translation_table, l, no_threading ); } else { - vector_double_copy( l->x, ((complex_double*)psi)+j*l->inner_vector_size, 0, l->inner_vector_size, l ); + vector_double psi_vec; + psi_vec.vector_buffer = ((complex_double*)psi)+j*l->inner_vector_size; + vector_double_copy( &(l->x), &psi_vec, 0, l->inner_vector_size, l ); } - phi=(double *)(l->x); + phi=(double *)(&(l->x)); phi_pt=phi; for ( t=0; tnext->data, bar_size, MPI_DOUBLE, 
desired_rank, 0, g.comm_cart, &rreq ); } diff --git a/src/linalg.c b/src/linalg.c index 3487404..e884e46 100644 --- a/src/linalg.c +++ b/src/linalg.c @@ -21,9 +21,8 @@ #include "main.h" -#ifndef OPTIMIZED_LINALG_float void process_multi_inner_product_MP( int count, complex_double *results, vector_float *phi, - vector_float psi, int start, int end, level_struct *l, + vector_float *psi, int start, int end, level_struct *l, struct Thread *threading ) { PROF_float_START( _PIP, threading ); @@ -39,7 +38,7 @@ void process_multi_inner_product_MP( int count, complex_double *results, vector_ compute_core_start_end_custom(start, end, &thread_start, &thread_end, l, threading, 12); for(int c=0; cvector_buffer[i]; i++; ) } } @@ -60,9 +59,30 @@ void process_multi_inner_product_MP( int count, complex_double *results, vector_ PROF_float_STOP( _PIP, (double)(end-start)/(double)l->inner_vector_size, threading ); } -#endif -double global_norm_MP( vector_float x, int start, int end, level_struct *l, struct Thread *threading ) { + +void process_multi_inner_product_MP_new( int count, complex_double *results, vector_float *phi, + vector_float *psi, level_struct *l, struct Thread *threading ) { + + int start, end; + compute_core_start_end(0, psi->size, &start, &end, l, threading); + int thread = omp_get_thread_num(); + if(thread == 0 && start != end) + PROF_float_START( _PIP, threading ); + + int i, j, jj; + for(int c=0; cnum_vect, jj, results[c*psi->num_vect+j+jj] = 0.0;) + + for(int c=0; cnum_vect, jj, results[c*psi->num_vect+j+jj] += (complex_double) conj_float(phi[c].vector_buffer[i*psi->num_vect+j+jj])*psi->vector_buffer[i*psi->num_vect+j+jj];) + + if(thread == 0 && start != end) + PROF_float_STOP( _PIP, (double)(end-start)/(double)l->inner_vector_size, threading ); +} + +double global_norm_MP( vector_float *x, int start, int end, level_struct *l, struct Thread *threading ) { PROF_float_START( _GIP, threading ); @@ -75,7 +95,7 @@ double global_norm_MP( vector_float x, int start, int end, level_struct *l, stru SYNC_CORES(threading) for ( i=thread_start; ivector_buffer[i]); i++; ) // sum over cores START_NO_HYPERTHREADS(threading) @@ -109,3 +129,23 @@ double global_norm_MP( vector_float x, int start, int end, level_struct *l, stru return sqrt((double)local_alpha); } } + +void global_norm_MP_new( double *res, vector_float *x, level_struct *l, struct Thread *threading ) { + + int start, end; + compute_core_start_end(0, x->size, &start, &end, l, threading); + int thread = omp_get_thread_num(); + if(thread == 0 && start != end) + PROF_float_START( _GIP, threading ); + + int i, j, jj; + VECTOR_LOOP(j, x->num_vect, jj, res[j+jj]=0;) + + for( i=start; inum_vect, jj, res[j+jj] += NORM_SQUARE_float(x->vector_buffer[i*x->num_vect+j+jj]);) + + VECTOR_LOOP(j, x->num_vect, jj, res[j+jj] = (double)sqrt((double)res[j+jj]);) + + if(thread == 0 && start != end) + PROF_float_STOP( _GIP, (double)(end-start)/(double)l->inner_vector_size, threading ); +} diff --git a/src/linalg.h b/src/linalg.h index 4182def..80e9514 100644 --- a/src/linalg.h +++ b/src/linalg.h @@ -24,16 +24,26 @@ struct Thread; - void vector_double_multi_saxpy( vector_double z, vector_double *V, complex_double *alpha, + void vector_double_multi_saxpy( vector_double *z, vector_double *V, complex_double *alpha, int sign, int count, int start, int end, level_struct *l ); - void vector_float_multi_saxpy( vector_float z, vector_float *V, complex_float *alpha, + void vector_float_multi_saxpy( vector_float *z, vector_float *V, complex_float *alpha, int sign, int count, 
int start, int end, level_struct *l ); - + + void vector_double_multi_saxpy_new( vector_double *z, vector_double *V, complex_double *alpha, + int sign, int count, level_struct *l, struct Thread *threading ); + + void vector_float_multi_saxpy_new( vector_float *z, vector_float *V, complex_float *alpha, + int sign, int count, level_struct *l, struct Thread *threading ); + void process_multi_inner_product_MP( int count, complex_double *results, vector_float *phi, - vector_float psi, int start, int end, level_struct *l, + vector_float *psi, int start, int end, level_struct *l, struct Thread *threading ); - - double global_norm_MP( vector_float x, int start, int end, level_struct *l, struct Thread *threading ); + void process_multi_inner_product_MP_new( int count, complex_double *results, vector_float *phi, + vector_float *psi, level_struct *l, struct Thread *threading ); + + double global_norm_MP( vector_float *x, int start, int end, level_struct *l, struct Thread *threading ); + + void global_norm_MP_new( double *res, vector_float *x, level_struct *l, struct Thread *threading ); #endif diff --git a/src/linalg_generic.c b/src/linalg_generic.c index db223bd..22f520f 100644 --- a/src/linalg_generic.c +++ b/src/linalg_generic.c @@ -21,12 +21,7 @@ #include "main.h" -#include "sse_float_intrinsic.h" -#include "sse_linalg.h" -#include "sse_linalg_PRECISION.h" - -#ifndef OPTIMIZED_LINALG_PRECISION -complex_PRECISION global_inner_product_PRECISION( vector_PRECISION phi, vector_PRECISION psi, int start, int end, level_struct *l, struct Thread *threading ) { +complex_PRECISION global_inner_product_PRECISION( vector_PRECISION *phi, vector_PRECISION *psi, int start, int end, level_struct *l, struct Thread *threading ) { PROF_PRECISION_START( _GIP, threading ); complex_PRECISION local_alpha = 0, global_alpha = 0; @@ -37,7 +32,7 @@ complex_PRECISION global_inner_product_PRECISION( vector_PRECISION phi, vector_P SYNC_CORES(threading) - VECTOR_FOR( int i=thread_start, ivector_buffer[i])*psi->vector_buffer[i], i++, l ); // sum over cores START_NO_HYPERTHREADS(threading) @@ -71,10 +66,9 @@ complex_PRECISION global_inner_product_PRECISION( vector_PRECISION phi, vector_P return local_alpha; } } -#endif -complex_PRECISION process_inner_product_PRECISION( vector_PRECISION phi, vector_PRECISION psi, int start, int end, level_struct *l, struct Thread *threading ) { +complex_PRECISION process_inner_product_PRECISION( vector_PRECISION *phi, vector_PRECISION *psi, int start, int end, level_struct *l, struct Thread *threading ) { PROF_PRECISION_START( _PIP, threading ); int i; @@ -82,7 +76,7 @@ complex_PRECISION process_inner_product_PRECISION( vector_PRECISION phi, vector_ SYNC_CORES(threading) - THREADED_VECTOR_FOR( i, start, end, local_alpha += conj_PRECISION(phi[i])*psi[i], i++, l, threading ); + THREADED_VECTOR_FOR( i, start, end, local_alpha += conj_PRECISION(phi->vector_buffer[i])*psi->vector_buffer[i], i++, l, threading ); START_NO_HYPERTHREADS(threading) ((complex_PRECISION *)threading->workspace)[threading->core] = local_alpha; @@ -103,8 +97,7 @@ complex_PRECISION process_inner_product_PRECISION( vector_PRECISION phi, vector_ } -#if !defined( OPTIMIZED_LINALG_PRECISION ) -void process_multi_inner_product_PRECISION( int count, complex_PRECISION *results, vector_PRECISION *phi, vector_PRECISION psi, +void process_multi_inner_product_PRECISION( int count, complex_PRECISION *results, vector_PRECISION *phi, vector_PRECISION *psi, int start, int end, level_struct *l, struct Thread *threading ) { PROF_PRECISION_START( 
_PIP, threading ); @@ -120,18 +113,18 @@ void process_multi_inner_product_PRECISION( int count, complex_PRECISION *result compute_core_start_end_custom(start, end, &thread_start, &thread_end, l, threading, 12); for(int c=0; cvector_buffer[i]; i++; ) } else { #ifdef _M10TV compute_core_start_end_custom(start, end, &thread_start, &thread_end, l, threading, 20); for(int c=0; cvector_buffer[i]; i++; ) #else compute_core_start_end_custom(start, end, &thread_start, &thread_end, l, threading, 2); for(int c=0; cvector_buffer[i]; i++; ) #endif } @@ -152,14 +145,35 @@ void process_multi_inner_product_PRECISION( int count, complex_PRECISION *result PROF_PRECISION_STOP( _PIP, (double)(end-start)/(double)l->inner_vector_size, threading ); } -#endif -complex_PRECISION local_xy_over_xx_PRECISION( vector_PRECISION phi, vector_PRECISION psi, int start, int end, level_struct *l ) { + +void process_multi_inner_product_PRECISION_new( int count, complex_PRECISION *results, vector_PRECISION *phi, vector_PRECISION *psi, + level_struct *l, struct Thread *threading ) { + + int start, end; + compute_core_start_end(0, psi->size, &start, &end, l, threading); + int thread = omp_get_thread_num(); + if(thread == 0 && start != end) + PROF_PRECISION_START( _PIP, threading ); + + int i, j, jj; + VECTOR_LOOP(j, count*psi->num_vect, jj, results[j+jj] = 0.0;) + + for(int c=0; cnum_vect, jj, results[c*psi->num_vect+j+jj] += conj_PRECISION(phi[c].vector_buffer[i*psi->num_vect+j+jj])*psi->vector_buffer[i*psi->num_vect+j+jj];) + + if(thread == 0 && start != end) + PROF_PRECISION_STOP( _PIP, (double)(end-start)/(double)l->inner_vector_size, threading ); +} + + +complex_PRECISION local_xy_over_xx_PRECISION( vector_PRECISION *phi, vector_PRECISION *psi, int start, int end, level_struct *l ) { complex_PRECISION numerator = 0.0; PRECISION denominator = 0.0; - VECTOR_FOR( int i=start, ivector_buffer[i])*psi->vector_buffer[i]; denominator += NORM_SQUARE_PRECISION(phi->vector_buffer[i]), i++, l ); if ( abs_PRECISION(denominator) < EPS_PRECISION ) { return 0.0; @@ -168,8 +182,7 @@ complex_PRECISION local_xy_over_xx_PRECISION( vector_PRECISION phi, vector_PRECI return numerator/denominator; } -#ifndef OPTIMIZED_LINALG_PRECISION -PRECISION global_norm_PRECISION( vector_PRECISION x, int start, int end, level_struct *l, struct Thread *threading ) { +PRECISION global_norm_PRECISION( vector_PRECISION *x, int start, int end, level_struct *l, struct Thread *threading ) { PROF_PRECISION_START( _GIP, threading ); @@ -181,7 +194,7 @@ PRECISION global_norm_PRECISION( vector_PRECISION x, int start, int end, level_s SYNC_CORES(threading) - VECTOR_FOR( int i=thread_start, ivector_buffer[i]), i++, l ); // sum over cores START_NO_HYPERTHREADS(threading) @@ -215,9 +228,8 @@ PRECISION global_norm_PRECISION( vector_PRECISION x, int start, int end, level_s return (PRECISION)sqrt((double)local_alpha); } } -#endif -PRECISION process_norm_PRECISION( vector_PRECISION x, int start, int end, level_struct *l, struct Thread *threading ) { +PRECISION process_norm_PRECISION( vector_PRECISION *x, int start, int end, level_struct *l, struct Thread *threading ) { int i; PRECISION local_alpha = 0; @@ -225,7 +237,7 @@ PRECISION process_norm_PRECISION( vector_PRECISION x, int start, int end, level_ SYNC_CORES(threading) - THREADED_VECTOR_FOR( i, start, end, local_alpha += NORM_SQUARE_PRECISION(x[i]), i++, l, threading ); + THREADED_VECTOR_FOR( i, start, end, local_alpha += NORM_SQUARE_PRECISION(x->vector_buffer[i]), i++, l, threading ); START_NO_HYPERTHREADS(threading) 
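The *_new reduction kernels introduced here return one value per right-hand side: process_multi_inner_product_*_new fills results[c*num_vect + j] and global_norm_*_new fills res[j]. The serial sketch below spells out the arithmetic under the interleaved layout; the in-tree routines additionally split the site range across cores with compute_core_start_end and handle the cross-process reduction where needed, none of which is reproduced here.

#include <complex.h>
#include <math.h>

/* Serial sketches of the *_new reduction kernels: one result per
 * right-hand side, interleaved storage x[i*num_vect + j]. */

/* results[c*num_vect + j] = <phi_c(:,j), psi(:,j)>, c = 0..count-1 */
static void multi_inner_product_sketch( int count, double complex *results,
                                        const double complex *const *phi,
                                        const double complex *psi,
                                        int size, int num_vect ) {
  for ( int c = 0; c < count; c++ )
    for ( int j = 0; j < num_vect; j++ )
      results[c*num_vect + j] = 0.0;
  for ( int c = 0; c < count; c++ )
    for ( int i = 0; i < size; i++ )
      for ( int j = 0; j < num_vect; j++ )
        results[c*num_vect + j] += conj( phi[c][i*num_vect + j] ) * psi[i*num_vect + j];
}

/* res[j] = || x(:,j) ||_2, one norm per right-hand side */
static void norm_per_rhs_sketch( double *res, const double complex *x,
                                 int size, int num_vect ) {
  for ( int j = 0; j < num_vect; j++ )
    res[j] = 0.0;
  for ( int i = 0; i < size; i++ )
    for ( int j = 0; j < num_vect; j++ ) {
      double complex z = x[i*num_vect + j];
      res[j] += creal(z)*creal(z) + cimag(z)*cimag(z);
    }
  for ( int j = 0; j < num_vect; j++ )
    res[j] = sqrt( res[j] );
}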
((PRECISION *)threading->workspace)[threading->core] = local_alpha; @@ -245,65 +257,130 @@ PRECISION process_norm_PRECISION( vector_PRECISION x, int start, int end, level_ return (PRECISION)sqrt((double)local_alpha); } +void global_norm_PRECISION_new( PRECISION *res, vector_PRECISION *x, level_struct *l, struct Thread *threading ) { + + int start, end; + compute_core_start_end(0, x->size, &start, &end, l, threading); + int thread = omp_get_thread_num(); + if(thread == 0 && start != end) + PROF_PRECISION_START( _GIP, threading ); + + int i, j, jj; + VECTOR_LOOP(j, x->num_vect, jj, res[j+jj]=0;) + + for( i=start; inum_vect, jj, res[j+jj] += NORM_SQUARE_PRECISION(x->vector_buffer[i*x->num_vect+j+jj]);) + + VECTOR_LOOP(j, x->num_vect, jj, res[j+jj] = (PRECISION)sqrt((double)res[j+jj]);) + + if(thread == 0 && start != end) + PROF_PRECISION_STOP( _GIP, (double)(end-start)/(double)l->inner_vector_size, threading ); +} + -void vector_PRECISION_plus( vector_PRECISION z, vector_PRECISION x, vector_PRECISION y, int start, int end, level_struct *l ) { +void vector_PRECISION_plus( vector_PRECISION *z, vector_PRECISION *x, vector_PRECISION *y, int start, int end, level_struct *l ) { int thread = omp_get_thread_num(); if(thread == 0 && start != end) PROF_PRECISION_START( _LA2 ); - VECTOR_FOR( int i=start, ivector_buffer[i] = x->vector_buffer[i] + y->vector_buffer[i], i++, l ); if(thread == 0 && start != end) PROF_PRECISION_STOP( _LA2, (double)(end-start)/(double)l->inner_vector_size ); } -void vector_PRECISION_minus( vector_PRECISION z, vector_PRECISION x, vector_PRECISION y, int start, int end, level_struct *l ) { - +void vector_PRECISION_plus_new( vector_PRECISION *z, vector_PRECISION *x, vector_PRECISION *y, level_struct *l, struct Thread *threading ) { + + int i, j, jj, start, end; + compute_core_start_end(0, x->size, &start, &end, l, threading); int thread = omp_get_thread_num(); if(thread == 0 && start != end) PROF_PRECISION_START( _LA2 ); - VECTOR_FOR( int i=start, inum_vect, jj, z->vector_buffer[i*x->num_vect+j+jj] = x->vector_buffer[i*x->num_vect+j+jj] + y->vector_buffer[i*x->num_vect+j+jj];) + if(thread == 0 && start != end) PROF_PRECISION_STOP( _LA2, (double)(end-start)/(double)l->inner_vector_size ); } -#ifndef OPTIMIZED_LINALG_PRECISION -void vector_PRECISION_scale( vector_PRECISION z, vector_PRECISION x, complex_PRECISION alpha, int start, int end, level_struct *l ) { + +void vector_PRECISION_minus( vector_PRECISION *z, vector_PRECISION *x, vector_PRECISION *y, int start, int end, level_struct *l ) { + + int thread = omp_get_thread_num(); + if(thread == 0 && start != end) + PROF_PRECISION_START( _LA2 ); + + VECTOR_FOR( int i=start, ivector_buffer[i] = x->vector_buffer[i] - y->vector_buffer[i], i++, l ); + if(thread == 0 && start != end) + PROF_PRECISION_STOP( _LA2, (double)(end-start)/(double)l->inner_vector_size ); +} + + +void vector_PRECISION_minus_new( vector_PRECISION *z, vector_PRECISION *x, vector_PRECISION *y, level_struct *l, struct Thread *threading ) { + + int i, j, jj, start, end; + compute_core_start_end(0, y->size, &start, &end, l, threading); + int thread = omp_get_thread_num(); + if(thread == 0 && start != end) + PROF_PRECISION_START( _LA2 ); + + for( i=start; inum_vect, jj, z->vector_buffer[i*x->num_vect+j+jj] = x->vector_buffer[i*x->num_vect+j+jj] - y->vector_buffer[i*x->num_vect+j+jj];) + + if(thread == 0 && start != end) + PROF_PRECISION_STOP( _LA2, (double)(end-start)/(double)l->inner_vector_size ); +} + +void vector_PRECISION_scale( vector_PRECISION *z, 
vector_PRECISION *x, complex_PRECISION alpha, int start, int end, level_struct *l ) { + int thread = omp_get_thread_num(); if(thread == 0 && start != end) PROF_PRECISION_START( _LA6 ); - VECTOR_FOR( int i=start, ivector_buffer[i] = alpha*x->vector_buffer[i], i++, l ); if(thread == 0 && start != end) PROF_PRECISION_STOP( _LA6, (double)(end-start)/(double)l->inner_vector_size ); } -#endif +void vector_PRECISION_scale_new( vector_PRECISION *z, vector_PRECISION *x, complex_PRECISION *alpha, int k, level_struct *l, struct Thread *threading ) { + + int i, j, jj, start, end; + compute_core_start_end(0, x->size, &start, &end, l, threading); + int thread = omp_get_thread_num(); + if(thread == 0 && start != end) + PROF_PRECISION_START( _LA6 ); + + for( i=start; inum_vect, jj, z->vector_buffer[i*x->num_vect+j+jj] = alpha[k*x->num_vect+j+jj]*x->vector_buffer[i*x->num_vect+j+jj];) -void vector_PRECISION_real_scale( vector_PRECISION z, vector_PRECISION x, complex_PRECISION alpha, + if(thread == 0 && start != end) + PROF_PRECISION_STOP( _LA6, (double)(end-start)/(double)l->inner_vector_size ); +} + + +void buffer_PRECISION_real_scale( complex_PRECISION *z, complex_PRECISION *x, complex_PRECISION alpha, int start, int end, level_struct *l ) { - + PRECISION *r_z = (PRECISION*)z, *r_x = (PRECISION*)x, r_alpha = creal_PRECISION(alpha); int r_start = 2*start, r_end = 2*end; - + int thread = omp_get_thread_num(); if(thread == 0 && start != end) PROF_PRECISION_START( _LA2 ); - + REAL_VECTOR_FOR( int i=r_start, iinner_vector_size ); } -void vector_PRECISION_copy( vector_PRECISION z, vector_PRECISION x, int start, int end, level_struct *l ) { +void buffer_PRECISION_copy( complex_PRECISION *z, complex_PRECISION *x, int start, int end, level_struct *l ) { int thread = omp_get_thread_num(); if(thread == 0 && start != end) @@ -315,22 +392,41 @@ void vector_PRECISION_copy( vector_PRECISION z, vector_PRECISION x, int start, i PROF_PRECISION_STOP( _CPY, (double)(end-start)/(double)l->inner_vector_size ); } -#ifndef OPTIMIZED_LINALG_PRECISION -void vector_PRECISION_saxpy( vector_PRECISION z, vector_PRECISION x, vector_PRECISION y, complex_PRECISION alpha, int start, int end, level_struct *l ) { - +void vector_PRECISION_saxpy( vector_PRECISION *z, vector_PRECISION *x, vector_PRECISION *y, complex_PRECISION alpha, int start, int end, level_struct *l ) { + int thread = omp_get_thread_num(); if (thread == 0 && start != end ) PROF_PRECISION_START( _LA8 ); - VECTOR_FOR( int i=start, ivector_buffer[i] = x->vector_buffer[i] + alpha*y->vector_buffer[i], i++, l ); if( thread == 0 && start != end ) PROF_PRECISION_STOP( _LA8, (double)(end-start)/(double)l->inner_vector_size ); } -#endif -#ifndef OPTIMIZED_LINALG_PRECISION -void vector_PRECISION_multi_saxpy( vector_PRECISION z, vector_PRECISION *V, complex_PRECISION *alpha, +// New input variable: sign +// sign == 1 : plus +// else: minus +void vector_PRECISION_saxpy_new( vector_PRECISION *z, vector_PRECISION *x, vector_PRECISION *y, complex_PRECISION *alpha, int k, int sign, level_struct *l, struct Thread *threading ) { + + int i, j, jj, start, end; + compute_core_start_end(0, x->size, &start, &end, l, threading); + int thread = omp_get_thread_num(); + if (thread == 0 && start != end ) + PROF_PRECISION_START( _LA8 ); + + if( sign == 1 ) + for( i=start; inum_vect, jj, z->vector_buffer[i*x->num_vect+j+jj] = x->vector_buffer[i*x->num_vect+j+jj] + alpha[k*x->num_vect+j+jj]*y->vector_buffer[i*x->num_vect+j+jj];) + else + for( i=start; inum_vect, jj, z->vector_buffer[i*x->num_vect+j+jj] = 
x->vector_buffer[i*x->num_vect+j+jj] - alpha[k*x->num_vect+j+jj]*y->vector_buffer[i*x->num_vect+j+jj];) + + if( thread == 0 && start != end ) + PROF_PRECISION_STOP( _LA8, (double)(end-start)/(double)l->inner_vector_size ); +} + +void vector_PRECISION_multi_saxpy( vector_PRECISION *z, vector_PRECISION *V, complex_PRECISION *alpha, int sign, int count, int start, int end, level_struct *l ) { int thread = omp_get_thread_num(); @@ -344,36 +440,58 @@ void vector_PRECISION_multi_saxpy( vector_PRECISION z, vector_PRECISION *V, comp for ( int c=0; cvector_buffer[i] += V[c].vector_buffer[i]*alpha_signed[c]; i++; ) } } if( thread == 0 && start != end ) PROF_PRECISION_STOP( _LA8, (PRECISION)(count) ); } -#endif -void vector_PRECISION_projection( vector_PRECISION z, vector_PRECISION v, int k, vector_PRECISION *W, complex_PRECISION *diag, +void vector_PRECISION_multi_saxpy_new( vector_PRECISION *z, vector_PRECISION *V, complex_PRECISION *alpha, + int sign, int count, level_struct *l, struct Thread *threading ) { + + int c, i, j, jj, start, end; + compute_core_start_end(0, z->size, &start, &end, l, threading); + int thread = omp_get_thread_num(); + if (thread == 0 && start != end ) + PROF_PRECISION_START( _LA8 ); + + complex_PRECISION alpha_signed[count*z->num_vect]; + for ( c=0; cnum_vect, jj, alpha_signed[c*z->num_vect+j+jj] = sign*alpha[c*z->num_vect+j+jj];) + + for ( c=0; cnum_vect, jj, z->vector_buffer[i*z->num_vect+j+jj] += V[c].vector_buffer[i*z->num_vect+j+jj]*alpha_signed[c];) + + if( thread == 0 && start != end ) + PROF_PRECISION_STOP( _LA8, (PRECISION)(count) ); +} + +void vector_PRECISION_projection( vector_PRECISION *z, vector_PRECISION *v, int k, vector_PRECISION *W, complex_PRECISION *diag, int orthogonal, level_struct *l, Thread *threading ) { int j, start, end; compute_core_start_end( 0, l->inner_vector_size, &start, &end, l, threading ); - vector_PRECISION v_tmp = NULL, *W_tmp = NULL; + vector_PRECISION v_tmp, *W_tmp = NULL; complex_PRECISION ip[k], ip_buffer[2*k]; - MALLOC( v_tmp, complex_PRECISION, l->inner_vector_size ); - vector_PRECISION_define(v_tmp, 0, 0, l->inner_vector_size, l ); + vector_PRECISION_init( &v_tmp ); + + vector_PRECISION_alloc( &v_tmp, _INNER, 1, l, no_threading ); + vector_PRECISION_define( &v_tmp, 0, 0, l->inner_vector_size, l ); - MALLOC( W_tmp, complex_PRECISION*, k ); - W_tmp[0] = NULL; - MALLOC( W_tmp[0], complex_PRECISION, k*l->inner_vector_size ); - for ( j = 1; jinner_vector_size; + MALLOC( W_tmp, vector_PRECISION, k ); + for ( j = 0; jinner_vector_size, l ); + vector_PRECISION_scale( &W_tmp[j], W+j, diag[j], 0, l->inner_vector_size, l ); } process_multi_inner_product_PRECISION( k, ip, W_tmp, v, 0, l->inner_vector_size, l, threading ); @@ -385,16 +503,18 @@ void vector_PRECISION_projection( vector_PRECISION z, vector_PRECISION v, int k, END_MASTER(threading) SYNC_MASTER_TO_ALL(threading) - vector_PRECISION_multi_saxpy( v_tmp, W_tmp, ip_buffer+k, 1, k, 0, l->inner_vector_size, l ); + vector_PRECISION_multi_saxpy( &v_tmp, W_tmp, ip_buffer+k, 1, k, 0, l->inner_vector_size, l ); if (orthogonal) - vector_PRECISION_minus( z, v, v_tmp, 0, l->inner_vector_size, l ); + vector_PRECISION_minus( z, v, &v_tmp, 0, l->inner_vector_size, l ); else - vector_PRECISION_copy( z, v_tmp, 0, l->inner_vector_size, l ); + vector_PRECISION_copy( z, &v_tmp, 0, l->inner_vector_size, l ); - FREE( v_tmp, complex_PRECISION, l->inner_vector_size ); - FREE( W_tmp[0], complex_PRECISION, k*l->inner_vector_size ); - FREE( W_tmp, complex_PRECISION*, k ); + vector_PRECISION_free( &v_tmp, l, 
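vector_PRECISION_saxpy_new takes an array of coefficients with one entry per right-hand side, a row index k into that array and a sign flag (1 for plus, anything else for minus); vector_PRECISION_multi_saxpy_new applies count such updates at once. The sketch below spells out the indexing under the interleaved layout. One detail that may be worth double-checking in the hunk above: alpha_signed is filled with count*num_vect entries but the update multiplies by alpha_signed[c]; the per-right-hand-side index used in this sketch is presumably what was intended.

#include <complex.h>

/* Serial sketch of the sign-aware axpy kernels; alpha holds one
 * coefficient per right-hand side, and k selects a row of the
 * (count x num_vect) coefficient table, matching alpha[k*num_vect + j]. */

/* z(:,j) = x(:,j) + sign * alpha[k*num_vect+j] * y(:,j) */
static void saxpy_sketch( double complex *z, const double complex *x,
                          const double complex *y, const double complex *alpha,
                          int k, int sign, int size, int num_vect ) {
  double s = ( sign == 1 ) ? 1.0 : -1.0;
  for ( int i = 0; i < size; i++ )
    for ( int j = 0; j < num_vect; j++ )
      z[i*num_vect+j] = x[i*num_vect+j] + s*alpha[k*num_vect+j]*y[i*num_vect+j];
}

/* z(:,j) += sign * sum_c alpha[c*num_vect+j] * V_c(:,j) */
static void multi_saxpy_sketch( double complex *z, const double complex *const *V,
                                const double complex *alpha, int sign,
                                int count, int size, int num_vect ) {
  double s = ( sign == 1 ) ? 1.0 : -1.0;
  for ( int c = 0; c < count; c++ )
    for ( int i = 0; i < size; i++ )
      for ( int j = 0; j < num_vect; j++ )
        z[i*num_vect+j] += s*alpha[c*num_vect+j]*V[c][i*num_vect+j];
}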
no_threading ); + for ( j = 0; jn_thread*threading->core+threading->thread; jn_thread*threading->n_core ) { for ( k1=0; k1 V[k2] | 2*j-th and 2*j+1-st aggregate for ( i=0; ivector_buffer[i] = alpha*phi->vector_buffer[i]; eta2->vector_buffer[i] = _COMPLEX_PRECISION_ZERO; i++; ) + FOR6( eta2->vector_buffer[i] = alpha*phi->vector_buffer[i]; eta1->vector_buffer[i] = _COMPLEX_PRECISION_ZERO; i++; ) } PROF_PRECISION_STOP( _LA6, 1 ); } -void set_boundary_PRECISION( vector_PRECISION phi, complex_PRECISION alpha, level_struct *l, struct Thread *threading ) { +void set_boundary_PRECISION( vector_PRECISION *phi, complex_PRECISION alpha, level_struct *l, struct Thread *threading ) { PROF_PRECISION_START( _SET, threading ); int i; SYNC_CORES(threading) - THREADED_VECTOR_FOR( i, l->inner_vector_size, l->vector_size, phi[i] = alpha, i++, l, threading ); + THREADED_VECTOR_FOR( i, l->inner_vector_size, l->vector_size, phi->vector_buffer[i] = alpha, i++, l, threading ); SYNC_CORES(threading) PROF_PRECISION_STOP( _SET, (double)(l->vector_size-l->inner_vector_size)/(double)l->inner_vector_size, threading ); @@ -496,7 +616,7 @@ void gram_schmidt_PRECISION( vector_PRECISION *V, complex_PRECISION *buffer, con for ( i=begin; iinner_vector_size, l, threading ); + process_multi_inner_product_PRECISION( i, tmp, V, &V[i], 0, l->inner_vector_size, l, threading ); SYNC_CORES(threading) START_MASTER(threading) for ( j=0; jinner_vector_size, l, threading ); + beta = global_norm_PRECISION( &V[i], 0, l->inner_vector_size, l, threading ); SYNC_MASTER_TO_ALL(threading) - vector_PRECISION_real_scale( V[i], V[i], creal(1.0/beta), start, end, l ); + vector_PRECISION_real_scale( &V[i], &V[i], creal(1.0/beta), start, end, l ); SYNC_CORES(threading) } @@ -534,7 +654,6 @@ void gram_schmidt_PRECISION( vector_PRECISION *V, complex_PRECISION *buffer, con } -#if !defined( SSE ) || !defined( GRAM_SCHMIDT_VECTORIZED_PRECISION ) void setup_gram_schmidt_PRECISION_compute_dots( complex_PRECISION *thread_buffer, vector_PRECISION *V, int count, int offset, int start, int end, level_struct *l, struct Thread *threading) { @@ -543,6 +662,8 @@ void setup_gram_schmidt_PRECISION_compute_dots( int thread_end; int cache_block_size = 12*64; complex_PRECISION tmp[cache_block_size]; + vector_PRECISION tmp_vect; + tmp_vect.vector_buffer = tmp; for(int i=0; i<2*offset; i++) thread_buffer[i] = 0.0; @@ -551,11 +672,11 @@ void setup_gram_schmidt_PRECISION_compute_dots( compute_core_start_end_custom(start, end, &thread_start, &thread_end, l, threading, cache_block_size); for ( int i=thread_start; idepth > 0 ) { - coarse_gamma5_PRECISION( g5v, V[i], thread_start, thread_end, l ); + coarse_gamma5_PRECISION( g5v, &V[i], thread_start, thread_end, l ); for ( j=0; jdepth > 0 ) { for( j=0; jinner_vector_size, threading ); } diff --git a/src/linalg_generic.h b/src/linalg_generic.h index 9bd7a20..9f6f7be 100644 --- a/src/linalg_generic.h +++ b/src/linalg_generic.h @@ -99,24 +99,31 @@ struct Thread; - complex_PRECISION global_inner_product_PRECISION( vector_PRECISION x, vector_PRECISION y, int start, int end, level_struct *l, struct Thread *threading ); - complex_PRECISION process_inner_product_PRECISION( vector_PRECISION phi, vector_PRECISION psi, int start, int end, level_struct *l, struct Thread *threading ); + complex_PRECISION global_inner_product_PRECISION( vector_PRECISION *x, vector_PRECISION *y, int start, int end, level_struct *l, struct Thread *threading ); + complex_PRECISION process_inner_product_PRECISION( vector_PRECISION *phi, vector_PRECISION *psi, int 
start, int end, level_struct *l, struct Thread *threading ); - void process_multi_inner_product_PRECISION( int count, complex_PRECISION *results, vector_PRECISION *phi, vector_PRECISION psi, + void process_multi_inner_product_PRECISION( int count, complex_PRECISION *results, vector_PRECISION *phi, vector_PRECISION *psi, int start, int end, level_struct *l, struct Thread *threading ); + void process_multi_inner_product_PRECISION_new( int count, complex_PRECISION *results, vector_PRECISION *phi, vector_PRECISION *psi, + level_struct *l, struct Thread *threading ); - PRECISION global_norm_PRECISION( vector_PRECISION phi, int start, int end, level_struct *l, struct Thread *threading ); - PRECISION process_norm_PRECISION( vector_PRECISION x, int start, int end, level_struct *l, struct Thread *threading ); - - complex_PRECISION local_xy_over_xx_PRECISION( vector_PRECISION phi, vector_PRECISION psi, int start, int end, level_struct *l ); - void vector_PRECISION_plus( vector_PRECISION z, vector_PRECISION x, vector_PRECISION y, int start, int end, level_struct *l ); // z := x + y - void vector_PRECISION_minus( vector_PRECISION z, vector_PRECISION x, vector_PRECISION y, int start, int end, level_struct *l ); // z := x - y - void vector_PRECISION_scale( vector_PRECISION z, vector_PRECISION x, complex_PRECISION alpha, int start, int end, level_struct *l ); // z := alpha*x - void vector_PRECISION_real_scale( vector_PRECISION z, vector_PRECISION x, complex_PRECISION alpha, + PRECISION global_norm_PRECISION( vector_PRECISION *phi, int start, int end, level_struct *l, struct Thread *threading ); + PRECISION process_norm_PRECISION( vector_PRECISION *x, int start, int end, level_struct *l, struct Thread *threading ); + void global_norm_PRECISION_new( PRECISION *res, vector_PRECISION *x, level_struct *l, struct Thread *threading ); + + complex_PRECISION local_xy_over_xx_PRECISION( vector_PRECISION *phi, vector_PRECISION *psi, int start, int end, level_struct *l ); + void vector_PRECISION_plus( vector_PRECISION *z, vector_PRECISION *x, vector_PRECISION *y, int start, int end, level_struct *l ); // z := x + y + void vector_PRECISION_plus_new( vector_PRECISION *z, vector_PRECISION *x, vector_PRECISION *y, level_struct *l, struct Thread *threading ); + void vector_PRECISION_minus( vector_PRECISION *z, vector_PRECISION *x, vector_PRECISION *y, int start, int end, level_struct *l ); // z := x - y + void vector_PRECISION_minus_new( vector_PRECISION *z, vector_PRECISION *x, vector_PRECISION *y, level_struct *l, struct Thread *threading ); + void vector_PRECISION_scale( vector_PRECISION *z, vector_PRECISION *x, complex_PRECISION alpha, int start, int end, level_struct *l ); // z := alpha*x + void vector_PRECISION_scale_new( vector_PRECISION *z, vector_PRECISION *x, complex_PRECISION *alpha, int k, level_struct *l, struct Thread *threading ); + void buffer_PRECISION_real_scale( complex_PRECISION *z, complex_PRECISION *x, complex_PRECISION alpha, int start, int end, level_struct *l ); - void vector_PRECISION_saxpy( vector_PRECISION z, vector_PRECISION x, vector_PRECISION y, complex_PRECISION alpha, int start, int end, level_struct *l ); // z := x + alpha*y - void vector_PRECISION_copy( vector_PRECISION z, vector_PRECISION x, int start, int end, level_struct *l ); // z := x - void vector_PRECISION_projection( vector_PRECISION z, vector_PRECISION v, int k, vector_PRECISION *W, complex_PRECISION *diag, + void vector_PRECISION_saxpy( vector_PRECISION *z, vector_PRECISION *x, vector_PRECISION *y, complex_PRECISION alpha, 
int start, int end, level_struct *l ); // z := x + alpha*y + void vector_PRECISION_saxpy_new( vector_PRECISION *z, vector_PRECISION *x, vector_PRECISION *y, complex_PRECISION *alpha, int k, int sign, level_struct *l, struct Thread *threading ); + void buffer_PRECISION_copy( complex_PRECISION *z, complex_PRECISION *x, int start, int end, level_struct *l ); // z := x + void vector_PRECISION_projection( vector_PRECISION *z, vector_PRECISION *v, int k, vector_PRECISION *W, complex_PRECISION *diag, int orthogonal, level_struct *l, Thread *threading ); void gram_schmidt_on_aggregates_PRECISION( vector_PRECISION *V, const int num_vec, level_struct *l, struct Thread *threading ); @@ -135,11 +142,11 @@ int num_vec, int leading_dimension, level_struct *l, struct Thread *threading ); void gram_schmidt_PRECISION( vector_PRECISION *V, complex_PRECISION *buffer, const int start, const int n, level_struct *l, struct Thread *threading ); - void setup_gram_schmidt_PRECISION( vector_PRECISION *V, vector_PRECISION g5v, + void setup_gram_schmidt_PRECISION( vector_PRECISION *V, vector_PRECISION *g5v, complex_PRECISION *buffer, const int n, level_struct *l, struct Thread *threading ); - void spinwise_PRECISION_skalarmultiply( vector_PRECISION eta1, vector_PRECISION eta2, - vector_PRECISION phi, complex_PRECISION alpha, int start, int end, level_struct *l ); - void set_boundary_PRECISION( vector_PRECISION phi, complex_PRECISION alpha, level_struct *l, struct Thread *threading ); + void spinwise_PRECISION_skalarmultiply( vector_PRECISION *eta1, vector_PRECISION *eta2, + vector_PRECISION *phi, complex_PRECISION alpha, int start, int end, level_struct *l ); + void set_boundary_PRECISION( vector_PRECISION *phi, complex_PRECISION alpha, level_struct *l, struct Thread *threading ); #endif diff --git a/src/linsolve.c b/src/linsolve.c index bc24c81..df5eca2 100644 --- a/src/linsolve.c +++ b/src/linsolve.c @@ -28,10 +28,10 @@ void fgmres_MP_struct_init( gmres_MP_struct *p ) { } -void fgmres_MP_struct_alloc( int m, int n, long int vl, double tol, const int prec_kind, +void fgmres_MP_struct_alloc( int m, int n, const int vl_type, double tol, const int prec_kind, void (*precond)(), gmres_MP_struct *p, level_struct *l ) { long int total=0; - int i, k=0; + int i, k=0, n_vl=g.num_rhs_vect; p->dp.restart_length = m; p->sp.restart_length = m; p->dp.num_restart = n; p->sp.num_restart = n; @@ -39,7 +39,7 @@ void fgmres_MP_struct_alloc( int m, int n, long int vl, double tol, const int pr if ( g.method == 6 ) { p->dp.eval_operator = g5D_plus_clover_double; p->sp.eval_operator = g5D_plus_clover_float; } else { - p->dp.eval_operator = d_plus_clover_double; p->sp.eval_operator = d_plus_clover_float; + p->dp.eval_operator = d_plus_clover_double_new; p->sp.eval_operator = d_plus_clover_float_new; } p->dp.tol = tol; p->sp.tol = MAX(tol,1E-5); p->dp.kind = _NOTHING; p->sp.kind = prec_kind; @@ -56,19 +56,18 @@ void fgmres_MP_struct_alloc( int m, int n, long int vl, double tol, const int pr if ( g.method == 6 ) { g.p.eval_operator = g5D_plus_clover_double; } else { - g.p.eval_operator = d_plus_clover_double; + g.p.eval_operator = d_plus_clover_double_new; } #ifdef HAVE_TM1p1 - vl*=2; + n_vl*=2; #endif // double precision part total = 0; - total += (m+1)*m; // Hessenberg matrix + total += (m+1)*m*n_vl; // Hessenberg matrix MALLOC( p->dp.H, complex_double*, m ); - total += 4*(m+1); // y, gamma, c, s - total += 3*vl; // x, r, b + total += 4*(m+1)*n_vl; // y, gamma, c, s p->dp.total_storage = total; // precomputed storage amount @@ -79,58 +78,54 
@@ void fgmres_MP_struct_alloc( int m, int n, long int vl, double tol, const int pr total = 0; // H for ( i=1; idp.H[i] = p->dp.H[0] + i*(m+1); - total += m*(m+1); + p->dp.H[i] = p->dp.H[0] + i*(m+1)*n_vl; + total += m*(m+1)*n_vl; // y - p->dp.y = p->dp.H[0] + total; total += m+1; + p->dp.y = p->dp.H[0] + total; total += (m+1)*n_vl; // gamma - p->dp.gamma = p->dp.H[0] + total; total += m+1; + p->dp.gamma = p->dp.H[0] + total; total += (m+1)*n_vl; // c - p->dp.c = p->dp.H[0] + total; total += m+1; + p->dp.c = p->dp.H[0] + total; total += (m+1)*n_vl; // s - p->dp.s = p->dp.H[0] + total; total += m+1; + p->dp.s = p->dp.H[0] + total; total += (m+1)*n_vl; // x - p->dp.x = p->dp.H[0] + total; total += vl; + vector_double_alloc( &(p->dp.x), vl_type, n_vl, l, no_threading ); // r - p->dp.r = p->dp.H[0] + total; total += vl; + vector_double_alloc( &(p->dp.r), vl_type, n_vl, l, no_threading ); // b - p->dp.b = p->dp.H[0] + total; total += vl; + vector_double_alloc( &(p->dp.b), vl_type, n_vl, l, no_threading ); ASSERT( p->dp.total_storage == total ); // single precision part total = 0; - total += (2+m)*vl; // w, V - MALLOC( p->sp.V, complex_float*, m+1 ); + MALLOC( p->sp.V, vector_float, m+1 ); if ( precond != NULL ) { if ( prec_kind == _RIGHT ) { - total += (m+1)*vl; // Z k = m+1; } else { - total += vl; k = 1; } - MALLOC( p->sp.Z, complex_float*, k ); + MALLOC( p->sp.Z, vector_float, k ); } p->sp.total_storage = total; // precomputed storage amount - p->sp.w = NULL; - MALLOC( p->sp.w, complex_float, total ); - // reserve storage total = 0; // w - p->sp.w = p->sp.w + total; total += vl; + vector_float_alloc( &(p->sp.w), vl_type, n_vl, l, no_threading ); // V for ( i=0; isp.V[i] = p->sp.w + total; total += vl; + vector_float_init( &(p->sp.V[i]) ); + vector_float_alloc( &(p->sp.V[i]), vl_type, n_vl, l, no_threading ); } // Z if ( precond != NULL ) { for ( i=0; isp.Z[i] = p->sp.w + total; total += vl; + vector_float_init( &(p->sp.Z[i]) ); + vector_float_alloc( &(p->sp.Z[i]), vl_type, n_vl, l, no_threading ); } } @@ -138,18 +133,20 @@ void fgmres_MP_struct_alloc( int m, int n, long int vl, double tol, const int pr } -void fgmres_MP_struct_free( gmres_MP_struct *p ) { +void fgmres_MP_struct_free( gmres_MP_struct *p, level_struct *l ) { // single precision - FREE( p->sp.w, complex_float, p->sp.total_storage ); - FREE( p->sp.V, complex_float*, p->sp.restart_length+1 ); + vector_float_free( &(p->sp.w), l, no_threading ); + FREE( p->sp.V, vector_float, p->sp.restart_length+1 ); if ( p->sp.Z != NULL ) - FREE( p->sp.Z, complex_float*, p->sp.kind==_RIGHT?p->sp.restart_length+1:1 ); + FREE( p->sp.Z, vector_float, p->sp.kind==_RIGHT?p->sp.restart_length+1:1 ); // double precision FREE( p->dp.H[0], complex_double, p->dp.total_storage ); FREE( p->dp.H, complex_double*, p->dp.restart_length ); - + vector_double_free( &(p->dp.x), l, no_threading ); + vector_double_free( &(p->dp.r), l, no_threading ); + vector_double_free( &(p->dp.b), l, no_threading ); } @@ -166,11 +163,17 @@ int fgmres_MP( gmres_MP_struct *p, level_struct *l, struct Thread *threading ) { int start; int end; - int j=-1, finish=0, iter=0, il, ol; - complex_double gamma0 = 0; - complex_double beta = 0; + int j=-1, finish=0, iter=0, il, ol, n_vect=g.num_rhs_vect, i, jj; + complex_double gamma0[n_vect];//gamma0=0; + double beta[n_vect]; //beta=0; - double norm_r0=1, gamma_jp1=1, t0=0, t1=0; + double t0=0, t1=0; + double norm_r0[n_vect], gamma_jp1[n_vect], gamma0_real[n_vect], gamma_tot, H_tot, gamma_tot2;//norm_r0=1, gamma_jp1=1 + complex_float 
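In fgmres_MP_struct_alloc, every scalar of the usual FGMRES bookkeeping becomes a row of n_vl values: the Hessenberg matrix grows to (m+1)*m*n_vl entries with columns spaced by (m+1)*n_vl, y, gamma, c and s each get (m+1)*n_vl, and x, r, b move to vector_double_alloc. A small indexing sketch, assuming the layout implied by the allocation hunk and by the later H[j][i*n_vect + n + jj] accesses; the helper names are illustrative only.

#include <stddef.h>

/* Layout sketch for the resized FGMRES work arrays with n_vl
 * right-hand sides. */
static size_t H_column_stride( int m, int n_vl ) {
  /* spacing between H[i] and H[i+1], cf. p->dp.H[i] = p->dp.H[0] + i*(m+1)*n_vl */
  return (size_t)( m + 1 ) * n_vl;
}

static size_t H_entry_offset( int i, int rhs, int n_vl ) {
  /* offset of entry i, right-hand side rhs inside one column H[j] */
  return (size_t)i * n_vl + rhs;
}

static size_t small_vector_storage( int m, int n_vl ) {
  /* (m+1)*m entries for H plus the four length-(m+1) vectors
     y, gamma, c, s, each carrying n_vl values per entry */
  return (size_t)( m + 1 ) * m * n_vl + (size_t)4 * ( m + 1 ) * n_vl;
}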
gamma_float[n_vect]; + + VECTOR_LOOP(i, n_vect, jj, norm_r0[i+jj]=1; + gamma_jp1[i+jj]=1;) + START_LOCKED_MASTER(threading) #ifndef WILSON_BENCHMARK if ( l->depth==0 && ( p->dp.timing || p->dp.print ) ) prof_init( l ); @@ -185,97 +188,124 @@ int fgmres_MP( gmres_MP_struct *p, level_struct *l, struct Thread *threading ) { SYNC_MASTER_TO_ALL(threading) // compute start and end indices for core // this puts zero for all other hyperthreads, so we can call functions below with all hyperthreads - compute_core_start_end(p->dp.v_start, p->dp.v_end, &start, &end, l, threading); + //compute_core_start_end(p->dp.v_start, p->dp.v_end, &start, &end, l, threading); // Outer loop in double precision for( ol=0; oldp.num_restart && finish==0; ol++ ) { - + if( ol == 0 && p->dp.initial_guess_zero ) { - vector_double_copy( p->dp.r, p->dp.b, start, end, l ); + //vector_double_copy( &(p->dp.r), &(p->dp.b), start, end, l ); + vector_double_copy_new( &(p->dp.r), &(p->dp.b), l, threading ); } else { - apply_operator_double( p->dp.r, p->dp.x, &(p->dp), l, threading ); // compute r <- D*x - vector_double_minus( p->dp.r, p->dp.b, p->dp.r, start, end, l ); // compute r <- b - r + apply_operator_double( &(p->dp.r), &(p->dp.x), &(p->dp), l, threading ); // compute r <- D*x + //vector_double_minus( &(p->dp.r), &(p->dp.b), &(p->dp.r), start, end, l ); // compute r <- b - r + vector_double_minus_new( &(p->dp.r), &(p->dp.b), &(p->dp.r), l, threading ); } - gamma0 = (complex_double) global_norm_double( p->dp.r, p->dp.v_start, p->dp.v_end, l, threading ); // gamma_0 = norm(r) + //gamma0 = (complex_double) global_norm_double( &(p->dp.r), p->dp.v_start, p->dp.v_end, l, threading ); // gamma_0 = norm(r) + global_norm_double_new( gamma0_real, &(p->dp.r), l, threading ); + VECTOR_LOOP(i, n_vect, jj, gamma0[i+jj]=gamma0_real[i+jj];) + START_MASTER(threading) - p->dp.gamma[0] = gamma0; + //p->dp.gamma[0] = gamma0; + VECTOR_LOOP(i, n_vect, jj, p->dp.gamma[i+jj] = gamma0[i+jj];) + END_MASTER(threading) SYNC_MASTER_TO_ALL(threading) if( ol == 0) { if (l->depth == 0 && !p->dp.initial_guess_zero) { - norm_r0 = global_norm_double( p->dp.b, start, end, l, threading ); - printf0("| initial guess relative residual: %le |\n", creal(gamma0)/norm_r0); + //norm_r0 = global_norm_double( &(p->dp.b), start, end, l, threading ); + global_norm_double_new( norm_r0, &(p->dp.b), l, threading ); + for( i=0; idp.print && g.print > 0 ) { START_MASTER(threading) printf0("+----------------------------------------------------------+\n"); - printf0("| restarting ... 
true residual norm: %6e |\n", creal(gamma0)/norm_r0 ); + for( i=0; isp.V[0], p->dp.r, l->s_float.op.translation_table, l, threading ); - vector_float_real_scale( p->sp.V[0], p->sp.V[0], (float)(1/p->dp.gamma[0]), start, end, l ); // V[0] <- r / gamma_0 - +#endif*/ + trans_float_new( &(p->sp.V[0]), &(p->dp.r), l->s_float.op.translation_table, l, threading ); + //vector_float_real_scale( &(p->sp.V[0]), &(p->sp.V[0]), (float)(1/p->dp.gamma[0]), start, end, l ); // V[0] <- r / gamma_0 + VECTOR_LOOP(i, n_vect, jj, gamma_float[i+jj]= (complex_float) p->dp.gamma[0*n_vect+i+jj];) + vector_float_real_scale_new( &(p->sp.V[0]), &(p->sp.V[0]), gamma_float, 0, 1, l, threading ); // inner loop in single precision for( il=0; ildp.restart_length && finish==0; il++) { j = il; iter++; - arnoldi_step_MP( p->sp.V, p->sp.Z, p->sp.w, p->dp.H, p->dp.y, j, p->sp.preconditioner, &(p->sp), l, threading ); - - if ( cabs( p->dp.H[j][j+1] ) > 1E-15 ) { + arnoldi_step_MP_new( p->sp.V, p->sp.Z, &(p->sp.w), p->dp.H, p->dp.y, j, p->sp.preconditioner, &(p->sp), l, threading ); + H_tot=0; + VECTOR_LOOP(i, n_vect, jj, H_tot += cabs( p->dp.H[j][(j+1)*n_vect+i+jj] );) + //if ( cabs( p->dp.H[j][j+1] ) > 1E-15 ) + if ( H_tot > n_vect*1E-15 ) { qr_update_double( p->dp.H, p->dp.s, p->dp.c, p->dp.gamma, j, l, threading ); - gamma_jp1 = cabs( p->dp.gamma[j+1] ); - + //gamma_jp1 = cabs( p->dp.gamma[j+1] ); + VECTOR_LOOP(i, n_vect, jj, gamma_jp1[i+jj] = cabs( p->dp.gamma[(j+1)*n_vect+i+jj] );) + if ( iter%10 == 0 || p->sp.preconditioner != NULL || l->depth > 0 ) { #if defined(TRACK_RES) && !defined(WILSON_BENCHMARK) START_MASTER(threading) if ( p->sp.print && g.print > 0 ) - printf0("| approx. rel. res. after %-6d iterations: %e |\n", iter, gamma_jp1/norm_r0 ); + for( i=0; idp.tol || gamma_jp1/norm_r0 > 1E+5 ) { // if satisfied ... stop + gamma_tot=0; + VECTOR_LOOP(i, n_vect, jj, gamma_tot += gamma_jp1[i+jj]/norm_r0[i+jj];) + + //if( gamma_jp1/norm_r0 < p->dp.tol || gamma_jp1/norm_r0 > 1E+5 ) // if satisfied ... 
stop + if( gamma_tot < n_vect*p->dp.tol || gamma_tot > n_vect*1E+5 ) { finish = 1; START_MASTER(threading) - if ( gamma_jp1/norm_r0 > 1E+5 ) printf0("Divergence of fgmres_MP, iter = %d, level=%d\n", iter, l->level ); + if ( gamma_tot > n_vect*1E+5 ) printf0("Divergence of fgmres_MP, iter = %d, level=%d\n", iter, l->level ); END_MASTER(threading) } - if( gamma_jp1/creal(gamma0) < p->sp.tol ) + gamma_tot2=0; + VECTOR_LOOP(i, n_vect, jj, gamma_tot2 += gamma_jp1[i+jj]/creal(gamma0[i+jj]);) + //if( gamma_jp1/creal(gamma0) < p->sp.tol ) + if( gamma_tot2 < n_vect*p->sp.tol ){ break; + } } else { finish = 1; } } // end of a single restart - compute_solution_MP( p->sp.w, (p->sp.preconditioner&&p->sp.kind==_RIGHT)?p->sp.Z:p->sp.V, + compute_solution_MP_new( &(p->sp.w), (p->sp.preconditioner&&p->sp.kind==_RIGHT)?p->sp.Z:p->sp.V, p->dp.y, p->dp.gamma, p->dp.H, j, &(p->sp), l, threading ); - - trans_back_float( p->dp.r, p->sp.w, l->s_float.op.translation_table, l, threading ); + + trans_back_float_new( &(p->dp.r), &(p->sp.w), l->s_float.op.translation_table, l, threading ); if ( ol == 0 ) { - vector_double_copy( p->dp.x, p->dp.r, start, end, l ); + //vector_double_copy( &(p->dp.x), &(p->dp.r), start, end, l ); + vector_double_copy_new(&(p->dp.x), &(p->dp.r), l, threading); } else { - vector_double_plus( p->dp.x, p->dp.x, p->dp.r, start, end, l ); + //vector_double_plus( &(p->dp.x), &(p->dp.x), &(p->dp.r), start, end, l ); + vector_double_plus_new( &(p->dp.x), &(p->dp.x), &(p->dp.r), l, threading ); } } // end of fgmres START_LOCKED_MASTER(threading) - if ( l->depth == 0 ) { t1 = MPI_Wtime(); g.total_time = t1-t0; g.iter_count = iter; g.norm_res = gamma_jp1/norm_r0; } + if ( l->depth == 0 ) { t1 = MPI_Wtime(); g.total_time = t1-t0; g.iter_count = iter; g.norm_res = gamma_tot; } END_LOCKED_MASTER(threading) if ( p->dp.print ) { #ifdef FGMRES_RESTEST - apply_operator_double( p->dp.r, p->dp.x, &(p->dp), l, threading ); - vector_double_minus( p->dp.r, p->dp.b, p->dp.r, start, end, l ); - beta = global_norm_double( p->dp.r, p->dp.v_start, p->dp.v_end, l, threading ); + apply_operator_double( &(p->dp.r), &(p->dp.x), &(p->dp), l, threading ); + //vector_double_minus( &(p->dp.r), &(p->dp.b), &(p->dp.r), start, end, l ); + vector_double_minus_new( &(p->dp.r), &(p->dp.b), &(p->dp.r), l, threading ); + //beta = global_norm_double( &(p->dp.r), p->dp.v_start, p->dp.v_end, l, threading ); + global_norm_double_new( beta, &(p->dp.r), l, threading ); #else - beta = gamma_jp1; + VECTOR_LOOP(i, n_vect, jj, beta[i+jj] = creal(gamma_jp1[i+jj]);) #endif START_MASTER(threading) #if defined(TRACK_RES) && !defined(WILSON_BENCHMARK) @@ -284,7 +314,8 @@ int fgmres_MP( gmres_MP_struct *p, level_struct *l, struct Thread *threading ) { printf0("+----------------------------------------------------------+\n"); printf0("| FGMRES MP iterations: %-6d coarse average: %-6.2lf |\n", iter, ((double)g.coarse_iter_count)/((double)iter) ); - printf0("| exact relative residual: ||r||/||b|| = %e |\n", creal(beta)/norm_r0 ); + for( i=0; i 0 ) printf0("| coarse grid time: %-8.4lf seconds (%04.1lf%%) |\n", @@ -321,7 +352,7 @@ int fgmres_MP( gmres_MP_struct *p, level_struct *l, struct Thread *threading ) { } -void arnoldi_step_MP( vector_float *V, vector_float *Z, vector_float w, +void arnoldi_step_MP( vector_float *V, vector_float *Z, vector_float *w, complex_double **H, complex_double* buffer, int j, void (*prec)(), gmres_float_struct *p, level_struct *l, struct Thread *threading ) { @@ -337,19 +368,19 @@ void arnoldi_step_MP( vector_float *V, 
vector_float *Z, vector_float w, if ( prec != NULL ) { if ( p->kind == _LEFT ) { - apply_operator_float( Z[0], V[j], p, l, threading ); - prec( w, NULL, Z[0], _NO_RES, l, threading ); + apply_operator_float( &Z[0], &V[j], p, l, threading ); + prec( w, NULL, &Z[0], _NO_RES, l, threading ); } else { if ( g.mixed_precision == 2 && (g.method >= 1 && g.method <= 2 ) ) { - prec( Z[j], w, V[j], _NO_RES, l, threading ); + prec( &Z[j], w, &V[j], _NO_RES, l, threading ); // obtains w = D * Z[j] from Schwarz } else { - prec( Z[j], NULL, V[j], _NO_RES, l, threading ); - apply_operator_float( w, Z[j], p, l, threading ); // w = D*Z[j] + prec( &Z[j], NULL, &V[j], _NO_RES, l, threading ); + apply_operator_float( w, &Z[j], p, l, threading ); // w = D*Z[j] } } } else { - apply_operator_float( w, V[j], p, l, threading ); // w = D*V[j] + apply_operator_float( w, &V[j], p, l, threading ); // w = D*V[j] } complex_double tmp[j+1]; @@ -383,11 +414,93 @@ void arnoldi_step_MP( vector_float *V, vector_float *Z, vector_float w, // V_j+1 = w / H_j+1,j if ( cabs_double( H[j][j+1] ) > 1e-15 ) - vector_float_real_scale( V[j+1], w, (float)(1/H[j][j+1]), start, end, l ); + vector_float_real_scale( &V[j+1], w, (float)(1/H[j][j+1]), start, end, l ); } -void compute_solution_MP( vector_float x, vector_float *V, complex_double *y, +void arnoldi_step_MP_new( vector_float *V, vector_float *Z, vector_float *w, + complex_double **H, complex_double* buffer, int j, void (*prec)(), + gmres_float_struct *p, level_struct *l, struct Thread *threading ) { + + SYNC_MASTER_TO_ALL(threading) + SYNC_CORES(threading) + int i, n_vect=g.num_rhs_vect, n, jj; + double H_tot; + complex_float H_float[n_vect]; + // start and end indices for vector functions depending on thread + int start; + int end; + // compute start and end indices for core + // this puts zero for all other hyperthreads, so we can call functions below with all hyperthreads + //compute_core_start_end(p->v_start, p->v_end, &start, &end, l, threading); + + if ( prec != NULL ) { + if ( p->kind == _LEFT ) { + apply_operator_float( &Z[0], &V[j], p, l, threading ); + prec( w, NULL, &Z[0], _NO_RES, l, threading ); + } else { + if ( g.mixed_precision == 2 && (g.method >= 1 && g.method <= 2 ) ) { + prec( &Z[j], w, &V[j], _NO_RES, l, threading ); + // obtains w = D * Z[j] from Schwarz + } else { + prec( &Z[j], NULL, &V[j], _NO_RES, l, threading ); + apply_operator_float( w, &Z[j], p, l, threading ); // w = D*Z[j] + } + } + } else { + apply_operator_float( w, &V[j], p, l, threading ); // w = D*V[j] + } + + complex_double tmp[(j+1)*n_vect]; + process_multi_inner_product_MP_new( j+1, tmp, V, w, l, threading ); + START_MASTER(threading) + for( i=0; i<=j; i++ ) + VECTOR_LOOP(n, n_vect, jj, buffer[i*n_vect+n+jj] = tmp[i*n_vect+n+jj];) + + if ( g.num_processes > 1 ) { + PROF_double_START( _ALLR ); + MPI_Allreduce( buffer, H[j], (j+1)*n_vect, MPI_COMPLEX_double, MPI_SUM, (l->depth==0)?g.comm_cart:l->gs_float.level_comm ); + PROF_double_STOP( _ALLR, 1 ); + } else { + for( i=0; i<=j; i++ ) + VECTOR_LOOP(n, n_vect, jj, H[j][i*n_vect+n+jj] = buffer[i*n_vect+n+jj];) + } + END_MASTER(threading) + SYNC_MASTER_TO_ALL(threading) + + complex_float alpha[(j+1)*n_vect]; + for( i=0; i<=j; i++ ) + VECTOR_LOOP(n, n_vect, jj, alpha[i*n_vect+n+jj] = (complex_float) H[j][i*n_vect+n+jj];) + for( i=0; i<=j; i++ ) + vector_float_saxpy_new( w, w, &V[i], alpha, i, -1, l, threading ); + /*// orthogonalization + complex_float alpha[(j+1)*n_vect]; + + for( i=0; i<=j; i++ ) + for( n_vec=0; n_vec n_vect*1e-15 ){ + 
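The _new Arnoldi step above orthogonalizes one composite vector w that carries all right-hand sides at once: process_multi_inner_product_MP_new returns (j+1)*n_vect local inner products, a single MPI_Allreduce fills column j of H, and the projections are then subtracted per right-hand side with vector_float_saxpy_new. A plain-array sketch of that classical Gram-Schmidt step; the stride len and the V[i][n*len+k] layout are assumptions of this sketch, the real layout stays hidden inside the vector helpers:

#include <complex.h>

static void gram_schmidt_block( int j, int n_vect, int len,
                                float complex **V, float complex *w,
                                double complex *Hj ) {
  /* 1) coefficients are all taken against the unmodified w, which is why a
   *    single Allreduce per Arnoldi step suffices in the solver */
  for ( int i=0; i<=j; i++ )
    for ( int n=0; n<n_vect; n++ ) {
      double complex h = 0;
      for ( int k=0; k<len; k++ )
        h += conjf( V[i][n*len+k] ) * w[n*len+k];   /* <V_i, w> for RHS n */
      Hj[i*n_vect+n] = h;
    }
  /* 2) subtract the projections for every right-hand side */
  for ( int i=0; i<=j; i++ )
    for ( int n=0; n<n_vect; n++ )
      for ( int k=0; k<len; k++ )
        w[n*len+k] -= (float complex)Hj[i*n_vect+n] * V[i][n*len+k];
}

The rescaling of V[j+1] afterwards is guarded by the summed |H[j][(j+1)*n_vect+n]| over the block (the H_tot > n_vect*1e-15 test), so a near-breakdown in one right-hand side alone does not suppress the normalization of the others.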
VECTOR_LOOP(n, n_vect, jj, H_float[n+jj]= (complex_float) H[j][(j+1)*n_vect+n+jj];) + vector_float_real_scale_new( &V[j+1], w, H_float, 0, 1, l, threading ); + } +} + + +void compute_solution_MP( vector_float *x, vector_float *V, complex_double *y, complex_double *gamma, complex_double **H, int j, gmres_float_struct *p, level_struct *l, struct Thread *threading ) { @@ -418,12 +531,57 @@ void compute_solution_MP( vector_float x, vector_float *V, complex_double *y, SYNC_MASTER_TO_ALL(threading) // x = V*y - vector_float_scale( x, V[0], (complex_float) y[0], start, end, l ); + vector_float_scale( x, &V[0], (complex_float) y[0], start, end, l ); complex_float alpha[j]; for ( i=1; i<=j; i++ ) alpha[i-1] = (complex_float) y[i]; - vector_float_multi_saxpy( x, &(V[1]), alpha, 1, j, start, end, l ); + vector_float_multi_saxpy( x, &V[1], alpha, 1, j, start, end, l ); } +void compute_solution_MP_new( vector_float *x, vector_float *V, complex_double *y, + complex_double *gamma, complex_double **H, int j, + gmres_float_struct *p, level_struct *l, struct Thread *threading ) { + + int i, k, n, jj, n_vect=g.num_rhs_vect; + complex_float y_float[n_vect]; + // start and end indices for vector functions depending on thread + //int start; + //int end; + // compute start and end indices for core + // this puts zero for all other hyperthreads, so we can call functions below with all hyperthreads + //compute_core_start_end(p->v_start, p->v_end, &start, &end, l, threading); + + START_MASTER(threading) + + PROF_double_START( _SMALL2 ); + + // backward substitution + for ( i=j; i>=0; i-- ) { + VECTOR_LOOP(n, n_vect, jj, y[i*n_vect+n+jj] = gamma[i*n_vect+n+jj];) + for ( k=i+1; k<=j; k++ ) { + for ( n=0; nZ = NULL; p->V = NULL; p->H = NULL; - p->x = NULL; - p->b = NULL; - p->r = NULL; - p->w = NULL; + vector_PRECISION_init(&(p->x)); + vector_PRECISION_init(&(p->b)); + vector_PRECISION_init(&(p->r)); + vector_PRECISION_init(&(p->w)); p->y = NULL; p->gamma = NULL; p->c = NULL; @@ -44,7 +44,7 @@ void fgmres_PRECISION_struct_init( gmres_PRECISION_struct *p ) { } -void fgmres_PRECISION_struct_alloc( int m, int n, long int vl, PRECISION tol, const int type, const int prec_kind, +void fgmres_PRECISION_struct_alloc( int m, int n, const int vl_type, PRECISION tol, const int type, const int prec_kind, void (*precond)(), void (*eval_op)(), gmres_PRECISION_struct *p, level_struct *l ) { /********************************************************************************* @@ -62,7 +62,7 @@ void fgmres_PRECISION_struct_alloc( int m, int n, long int vl, PRECISION tol, co *********************************************************************************/ long int total=0; - int i, k=0; + int i, k=0, n_vl=g.num_rhs_vect; p->restart_length = m; p->num_restart = n; @@ -72,38 +72,34 @@ void fgmres_PRECISION_struct_alloc( int m, int n, long int vl, PRECISION tol, co p->kind = prec_kind; #ifdef HAVE_TM1p1 - vl*=2; + n_vl*=2; #endif if(m > 0) { - total += (m+1)*m; // Hessenberg matrix + total += (m+1)*m*n_vl; // Hessenberg matrix MALLOC( p->H, complex_PRECISION*, m ); - total += (5+m)*vl; // x, r, b, w, V - MALLOC( p->V, complex_PRECISION*, m+1 ); + MALLOC( p->V, vector_PRECISION, m+1 ); if ( precond != NULL ) { if ( prec_kind == _RIGHT ) { - total += (m+1)*vl; // Z k = m+1; } else { - total += vl; k = 1; } - MALLOC( p->Z, complex_PRECISION*, k ); + MALLOC( p->Z, vector_PRECISION, k ); } else { #if defined(SINGLE_ALLREDUCE_ARNOLDI) && defined(PIPELINED_ARNOLDI) if ( l->level == 0 && l->depth > 0 ) { - total += (m+2)*vl; k = m+2; - MALLOC( 
p->Z, complex_PRECISION*, k ); + MALLOC( p->Z, vector_PRECISION, k ); } #else k = 0; #endif } - total += 4*(m+1); // y, gamma, c, s + total += 4*(m+1)*n_vl; // y, gamma, c, s p->H[0] = NULL; // allocate connected memory MALLOC( p->H[0], complex_PRECISION, total ); @@ -114,34 +110,36 @@ void fgmres_PRECISION_struct_alloc( int m, int n, long int vl, PRECISION tol, co // ordering: H, y, gamma, c, s, w, V, Z, x, r, b // H for ( i=1; iH[i] = p->H[0] + i*(m+1); - total += m*(m+1); + p->H[i] = p->H[0] + i*(m+1)*n_vl; + total += m*(m+1)*n_vl; // y - p->y = p->H[0] + total; total += m+1; + p->y = p->H[0] + total; total += (m+1)*n_vl; // gamma - p->gamma = p->H[0] + total; total += m+1; + p->gamma = p->H[0] + total; total += (m+1)*n_vl; // c - p->c = p->H[0] + total; total += m+1; + p->c = p->H[0] + total; total += (m+1)*n_vl; // s - p->s = p->H[0] + total; total += m+1; + p->s = p->H[0] + total; total += (m+1)*n_vl; // w - p->w = p->H[0] + total; total += vl; + vector_PRECISION_alloc( &(p->w), vl_type, n_vl, l, no_threading ); // V for ( i=0; iV[i] = p->H[0] + total; total += vl; + vector_PRECISION_init(&(p->V[i])); + vector_PRECISION_alloc( &(p->V[i]), vl_type, n_vl, l, no_threading ); } // Z for ( i=0; iZ[i] = p->H[0] + total; total += vl; + vector_PRECISION_init(&(p->Z[i])); + vector_PRECISION_alloc( &(p->Z[i]), vl_type, n_vl, l, no_threading ); } // x - p->x = p->H[0] + total; total += vl; + vector_PRECISION_alloc( &(p->x), vl_type, n_vl, l, no_threading ); // r - p->r = p->H[0] + total; total += vl; + vector_PRECISION_alloc( &(p->r), vl_type, n_vl, l, no_threading ); // b - p->b = p->H[0] + total; total += vl; + vector_PRECISION_alloc( &(p->b), vl_type, n_vl, l, no_threading ); ASSERT( p->total_storage == total ); } @@ -205,10 +203,14 @@ void fgmres_PRECISION_struct_free( gmres_PRECISION_struct *p, level_struct *l ) if(p->restart_length > 0) { FREE( p->H[0], complex_PRECISION, p->total_storage ); FREE( p->H, complex_PRECISION*, p->restart_length ); - FREE( p->V, complex_PRECISION*, p->restart_length+1 ); - + FREE( p->V, vector_PRECISION, p->restart_length+1 ); + vector_PRECISION_free( &(p->w), l, no_threading ); + vector_PRECISION_free( &(p->x), l, no_threading ); + vector_PRECISION_free( &(p->r), l, no_threading ); + vector_PRECISION_free( &(p->b), l, no_threading ); + if ( p->Z != NULL ) - FREE( p->Z, complex_PRECISION*, k ); + FREE( p->Z, vector_PRECISION, k ); } p->D = NULL; @@ -228,11 +230,17 @@ int fgmres_PRECISION( gmres_PRECISION_struct *p, level_struct *l, struct Thread int end; int j=-1, finish=0, iter=0, il, ol, res; - complex_PRECISION gamma0 = 0; - - complex_PRECISION beta = 0; + int n_vect=g.num_rhs_vect, i, jj; + complex_PRECISION gamma0[n_vect];//gamma0 = 0; + + PRECISION beta[n_vect];//complex_PRECISION beta = 0; - PRECISION norm_r0=1, gamma_jp1=1, t0=0, t1=0; + double H_tot; + PRECISION norm_r0[n_vect], gamma_jp1[n_vect], gamma_tot, gamma0_real[n_vect], t0=0, t1=0; + + VECTOR_LOOP(i, n_vect, jj, norm_r0[i+jj]=1; + gamma_jp1[i+jj]=1;) + START_LOCKED_MASTER(threading) if ( l->depth==0 && ( p->timing || p->print ) ) prof_init( l ); @@ -247,47 +255,58 @@ int fgmres_PRECISION( gmres_PRECISION_struct *p, level_struct *l, struct Thread SYNC_MASTER_TO_ALL(threading) // compute start and end indices for core // this puts zero for all other hyperthreads, so we can call functions below with all hyperthreads - compute_core_start_end(p->v_start, p->v_end, &start, &end, l, threading); - - for( ol=0; olnum_restart && finish==0; ol++ ) { - + 
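Taking stock of this allocation hunk: the connected allocation is kept only for the small matrices and scalars (H, y, gamma, c, s), whose every logical entry is now replicated n_vl = g.num_rhs_vect times (doubled again under HAVE_TM1p1), while x, r, b, w, V[i] and Z[i] become vector_PRECISION objects with their own vector_PRECISION_alloc / vector_PRECISION_free calls instead of being carved out of the H[0] slab. A small sketch of the resulting indexing convention; hess_entry is a hypothetical helper introduced here only to document the layout, it is not part of the patch:

#include <complex.h>

/* H[col] points at column `col` of the Hessenberg matrix; within a column,
 * the row entries are stored with stride n_vect, one slot per right-hand
 * side (the project's complex_double is plain double complex). */
static inline double complex hess_entry( double complex **H,
                                         int row, int col, int rhs, int n_vect ) {
  return H[col][row*n_vect + rhs];
}

/* the same convention holds for the flat arrays, e.g. gamma[(j+1)*n_vect + rhs] */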
//compute_core_start_end(p->v_start+p->r.size*n_vec, p->v_end+p->r.size*n_vec, &start, &end, l, threading); + + SYNC_CORES(threading) + for( ol=0; olnum_restart && finish==0; ol++ ) { if( ol == 0 && p->initial_guess_zero ) { res = _NO_RES; - vector_PRECISION_copy( p->r, p->b, start, end, l ); + //vector_PRECISION_copy( &(p->r), &(p->b), start, end, l ); + vector_PRECISION_copy_new( &(p->r), &(p->b), l, threading ); } else { res = _RES; if ( p->kind == _LEFT && p->preconditioner ) { - apply_operator_PRECISION( p->Z[0], p->x, p, l, threading ); + apply_operator_PRECISION( &(p->Z[0]), &(p->x), p, l, threading ); if ( g.method == 5 ) { START_LOCKED_MASTER(threading) - g.bicgstab_tol = (!g.mixed_precision)?p->tol:MAX( 1E-3, (p->tol/(gamma_jp1/norm_r0))*5E-1 ); + //g.bicgstab_tol = (!g.mixed_precision)?p->tol:MAX( 1E-3, (p->tol/(gamma_jp1/norm_r0))*5E-1 ); END_LOCKED_MASTER(threading) } - p->preconditioner( p->w, NULL, p->Z[0], _NO_RES, l, threading ); + p->preconditioner( &(p->w), NULL, &(p->Z[0]), _NO_RES, l, threading ); } else { - apply_operator_PRECISION( p->w, p->x, p, l, threading ); // compute w = D*x + apply_operator_PRECISION( &(p->w), &(p->x), p, l, threading ); // compute w = D*x } - vector_PRECISION_minus( p->r, p->b, p->w, start, end, l ); // compute r = b - w + //vector_PRECISION_minus( &(p->r), &(p->b), &(p->w), start, end, l ); // compute r = b - w + vector_PRECISION_minus_new( &(p->r), &(p->b), &(p->w), l, threading ); } - gamma0 = global_norm_PRECISION( p->r, p->v_start, p->v_end, l, threading ); // gamma_0 = norm(r) + //gamma0 = global_norm_PRECISION( &(p->r), p->v_start, p->v_end, l, threading ); // gamma_0 = norm(r) + global_norm_PRECISION_new( gamma0_real, &(p->r), l, threading ); + + VECTOR_LOOP(i, n_vect, jj, gamma0[i+jj]=gamma0_real[i+jj];) + START_MASTER(threading) - p->gamma[0] = gamma0; + //p->gamma[0] = gamma0; + VECTOR_LOOP(i, n_vect, jj, p->gamma[i+jj] = gamma0[i+jj];) + END_MASTER(threading); SYNC_MASTER_TO_ALL(threading); if ( ol == 0 ) { if (l->depth == 0 && !p->initial_guess_zero) { - norm_r0 = global_norm_PRECISION( p->b, p->v_start, p->v_end, l, threading ); - printf0("| initial guess relative residual: %le |\n", creal(gamma0)/norm_r0); + //norm_r0 = global_norm_PRECISION( &(p->b), p->v_start, p->v_end, l, threading ); + global_norm_PRECISION_new( norm_r0, &(p->b), l, threading ); + for( i=0; igamma[0]); + //norm_r0 = creal(p->gamma[0]); + VECTOR_LOOP(i, n_vect, jj, norm_r0[i+jj] = creal(p->gamma[i+jj]);) } } - - vector_PRECISION_real_scale( p->V[0], p->r, 1/p->gamma[0], start, end, l ); // v_0 = r / gamma_0 + //vector_PRECISION_real_scale( &(p->V[0]), &(p->r), 1/p->gamma[0], start, end, l ); // v_0 = r / gamma_0 + vector_PRECISION_real_scale_new( &(p->V[0]), &(p->r), p->gamma, 0, 1, l, threading ); #if defined(SINGLE_ALLREDUCE_ARNOLDI) && defined(PIPELINED_ARNOLDI) if ( l->level == 0 && l->depth > 0 ) { - arnoldi_step_PRECISION( p->V, p->Z, p->w, p->H, p->y, 0, p->preconditioner, p, l, threading ); + arnoldi_step_PRECISION( p->V, p->Z, &(p->w), p->H, p->y, 0, p->preconditioner, p, l, threading ); } #endif @@ -295,79 +314,96 @@ int fgmres_PRECISION( gmres_PRECISION_struct *p, level_struct *l, struct Thread j = il; iter++; if ( g.method == 5 ) { START_LOCKED_MASTER(threading) - g.bicgstab_tol = (!g.mixed_precision)?p->tol:MAX( 1E-3, (p->tol/(gamma_jp1/norm_r0))*5E-1 ); + //g.bicgstab_tol = (!g.mixed_precision)?p->tol:MAX( 1E-3, (p->tol/(gamma_jp1/norm_r0))*5E-1 ); END_LOCKED_MASTER(threading) } // one step of Arnoldi #if defined(SINGLE_ALLREDUCE_ARNOLDI) && 
defined(PIPELINED_ARNOLDI) if ( l->level == 0 && l->depth > 0 ) { - if ( !arnoldi_step_PRECISION( p->V, p->Z, p->w, p->H, p->y, j+1, p->preconditioner, p, l, threading ) ) { + if ( !arnoldi_step_PRECISION( p->V, p->Z, &(p->w), p->H, p->y, j+1, p->preconditioner, p, l, threading ) ) { printf0("| -------------- iteration %d, restart due to H(%d,%d) < 0 |\n", iter, j+2, j+1 ); break; } } else { - if ( !arnoldi_step_PRECISION( p->V, p->Z, p->w, p->H, p->y, j, p->preconditioner, p, l, threading ) ) { + if ( !arnoldi_step_PRECISION( p->V, p->Z, &(p->w), p->H, p->y, j, p->preconditioner, p, l, threading ) ) { printf0("| -------------- iteration %d, restart due to H(%d,%d) < 0 |\n", iter, j+1, j ); break; } } #else - if ( !arnoldi_step_PRECISION( p->V, p->Z, p->w, p->H, p->y, j, p->preconditioner, p, l, threading ) ) { + if ( !arnoldi_step_PRECISION_new( p->V, p->Z, &(p->w), p->H, p->y, j, p->preconditioner, p, l, threading ) ) { printf0("| -------------- iteration %d, restart due to H(%d,%d) < 0 |\n", iter, j+1, j ); break; } #endif - - if ( cabs( p->H[j][j+1] ) > p->tol/10 ) { + H_tot=0; + VECTOR_LOOP(i, n_vect, jj, H_tot += cabs( p->H[j][(j+1)*n_vect+i+jj] );) + + //if ( cabs( p->H[j][j+1] ) > p->tol/10 ) + if ( H_tot > n_vect*p->tol/10 ) { qr_update_PRECISION( p->H, p->s, p->c, p->gamma, j, l, threading ); - gamma_jp1 = cabs( p->gamma[j+1] ); - + //gamma_jp1 = cabs( p->gamma[(j+1)] ); + VECTOR_LOOP(i, n_vect, jj, gamma_jp1[i+jj] = cabs( p->gamma[(j+1)*n_vect+i+jj] );) + #if defined(TRACK_RES) && !defined(WILSON_BENCHMARK) if ( iter%10 == 0 || p->preconditioner != NULL || l->depth > 0 ) { START_MASTER(threading) if ( p->print && g.print > 0 ) - printf0("| approx. rel. res. after %-6d iterations: %e |\n", iter, gamma_jp1/norm_r0 ); + for( i=0; itol || gamma_jp1/norm_r0 > 1E+5 ) { // if satisfied ... stop + gamma_tot=0; + VECTOR_LOOP(i, n_vect, jj, gamma_tot += gamma_jp1[i+jj]/norm_r0[i+jj];) + + //if( gamma_jp1/norm_r0 < p->tol || gamma_jp1/norm_r0 > 1E+5 ) // if satisfied ... 
stop + if( gamma_tot < n_vect*p->tol || gamma_tot > n_vect*1E+5 ) { finish = 1; START_MASTER(threading) - if ( gamma_jp1/norm_r0 > 1E+5 ) printf0("Divergence of fgmres_PRECISION, iter = %d, level=%d\n", iter, l->level ); + if ( gamma_tot > n_vect*1E+5 ) printf0("Divergence of fgmres_PRECISION, iter = %d, level=%d\n", iter, l->level ); END_MASTER(threading) } } else { - printf0("depth: %d, iter: %d, p->H(%d,%d) = %+lf+%lfi\n", l->depth, iter, j+1, j, CSPLIT( p->H[j][j+1] ) ); + for( i=0; iH(%d,%d) = %+lf+%lfi\n", i, l->depth, iter, j+1, j, CSPLIT( p->H[j][(j+1)*n_vect+i] ) ); finish = 1; break; } } // end of a single restart - compute_solution_PRECISION( p->x, (p->preconditioner&&p->kind==_RIGHT)?p->Z:p->V, + /*compute_solution_PRECISION( &(p->x), (p->preconditioner&&p->kind==_RIGHT)?(p->Z):(p->V), + p->y, p->gamma, p->H, j, (res==_NO_RES)?ol:1, p, l, threading );*/ + compute_solution_PRECISION_new( &(p->x), (p->preconditioner&&p->kind==_RIGHT)?(p->Z):(p->V), p->y, p->gamma, p->H, j, (res==_NO_RES)?ol:1, p, l, threading ); } // end of fgmres START_LOCKED_MASTER(threading) - if ( l->depth == 0 ) { t1 = MPI_Wtime(); g.total_time = t1-t0; g.iter_count = iter; g.norm_res = gamma_jp1/norm_r0; } + if ( l->depth == 0 ) { t1 = MPI_Wtime(); g.total_time = t1-t0; g.iter_count = iter; g.norm_res = gamma_tot ; }//= gamma_jp1/norm_r0; } END_LOCKED_MASTER(threading) if ( p->print ) { #ifdef FGMRES_RESTEST - apply_operator_PRECISION( p->w, p->x, p, l, threading ); - vector_PRECISION_minus( p->r, p->b, p->w, start, end, l ); - beta = global_norm_PRECISION( p->r, p->v_start, p->v_end, l, threading ); + apply_operator_PRECISION( &(p->w), &(p->x), p, l, threading ); + //vector_PRECISION_minus( &(p->r), &(p->b), &(p->w), start, end, l ); + vector_PRECISION_minus_new( &(p->r), &(p->b), &(p->w), l, threading ); + //beta = global_norm_PRECISION( &(p->r), p->v_start+p->r.size*n_vec, p->v_end+p->r.size*n_vec, l, threading ); + global_norm_PRECISION_new( beta, &(p->r), l, threading ); #else - beta = gamma_jp1; + VECTOR_LOOP(i, n_vect, jj, beta[i+jj] = creal_PRECISION(gamma_jp1[i+jj]);) #endif START_MASTER(threading) - g.norm_res = creal(beta)/norm_r0; + //g.norm_res = creal(beta)/norm_r0; + g.norm_res = 0; + VECTOR_LOOP(i, n_vect, jj, g.norm_res += beta[i+jj]/norm_r0[i+jj];) #if defined(TRACK_RES) && !defined(WILSON_BENCHMARK) if ( g.print > 0 ) printf0("+----------------------------------------------------------+\n\n"); #endif printf0("+----------------------------------------------------------+\n"); printf0("| FGMRES iterations: %-6d coarse average: %-6.2lf |\n", iter, ((double)g.coarse_iter_count)/((double)iter) ); - printf0("| exact relative residual: ||r||/||b|| = %e |\n", creal(beta)/norm_r0 ); + for( i=0; i 0 ) printf0("| coarse grid time: %-8.4lf seconds (%04.1lf%%) |\n", @@ -383,7 +419,7 @@ int fgmres_PRECISION( gmres_PRECISION_struct *p, level_struct *l, struct Thread if ( l->depth > 0 ) { START_MASTER(threading) char number[3]; sprintf( number, "%2d", 31+l->depth ); printf0("\033[1;%2sm|", number ); - printf0(" - depth: %d, gmres iter: %2d, approx rel res: %le |", l->depth, iter, gamma_jp1/norm_r0 ); + //printf0(" - depth: %d, gmres iter: %2d, approx rel res: %le |", l->depth, iter, gamma_jp1/norm_r0 ); printf0("\033[0m\n"); fflush(0); END_MASTER(threading) } @@ -411,8 +447,7 @@ int fgmres_PRECISION( gmres_PRECISION_struct *p, level_struct *l, struct Thread START_MASTER(threading) if ( g.method != 6 ) prof_print( l ); END_MASTER(threading) - } - + } return iter; } @@ -439,13 +474,13 @@ void 
bicgstab_PRECISION( gmres_PRECISION_struct *ps, level_struct *l, struct Thr maxiter = 1000000; r = ps->r; b = ps->b; x = ps->x; p = ps->w; pp = ps->V[0]; r_tilde = ps->V[1]; v = ps->V[2]; s = ps->V[3]; t = ps->V[4]; - vector_PRECISION_copy( r, b, start, end, l ); - vector_PRECISION_copy( r_tilde, b, start, end, l ); - vector_PRECISION_define( x, 0, start, end, l ); - vector_PRECISION_define( v, 0, start, end, l ); - vector_PRECISION_define( s, 0, start, end, l ); - vector_PRECISION_define( t, 0, start, end, l ); - b_norm = global_norm_PRECISION( b, ps->v_start, ps->v_end, l, threading ); + vector_PRECISION_copy( &r, &b, start, end, l ); + vector_PRECISION_copy( &r_tilde, &b, start, end, l ); + vector_PRECISION_define( &x, 0, start, end, l ); + vector_PRECISION_define( &v, 0, start, end, l ); + vector_PRECISION_define( &s, 0, start, end, l ); + vector_PRECISION_define( &t, 0, start, end, l ); + b_norm = global_norm_PRECISION( &b, ps->v_start, ps->v_end, l, threading ); r_norm = b_norm; #if defined(TRACK_RES) && !defined(WILSON_BENCHMARK) @@ -457,7 +492,7 @@ void bicgstab_PRECISION( gmres_PRECISION_struct *ps, level_struct *l, struct Thr iter++; rho_old = rho; - rho = global_inner_product_PRECISION( r_tilde, r, ps->v_start, ps->v_end, l, threading ); + rho = global_inner_product_PRECISION( &r_tilde, &r, ps->v_start, ps->v_end, l, threading ); if ( rho == 0 ) { START_MASTER(threading) @@ -467,31 +502,31 @@ void bicgstab_PRECISION( gmres_PRECISION_struct *ps, level_struct *l, struct Thr } if ( iter == 1 ) { - vector_PRECISION_copy( p, r, start, end, l ); + vector_PRECISION_copy( &p, &r, start, end, l ); } else { beta = (rho/rho_old)*(alpha/omega); - vector_PRECISION_saxpy( pp, p, v, -omega, start, end, l ); - vector_PRECISION_saxpy( p, r, pp, beta, start, end, l ); + vector_PRECISION_saxpy( &pp, &p, &v, -omega, start, end, l ); + vector_PRECISION_saxpy( &p, &r, &pp , beta, start, end, l ); } - apply_operator_PRECISION( v, p, ps, l, threading ); - alpha = rho / global_inner_product_PRECISION( r_tilde, v, ps->v_start, ps->v_end, l, threading ); - vector_PRECISION_saxpy( s, r, v, -alpha, start, end, l ); - s_norm = global_norm_PRECISION( s, ps->v_start, ps->v_end, l, threading ); + apply_operator_PRECISION( &v, &p, ps, l, threading ); + alpha = rho / global_inner_product_PRECISION( &r_tilde, &v, ps->v_start, ps->v_end, l, threading ); + vector_PRECISION_saxpy( &s, &r, &v, -alpha, start, end, l ); + s_norm = global_norm_PRECISION( &s, ps->v_start, ps->v_end, l, threading ); if ( s_norm/b_norm < tol ) { - vector_PRECISION_saxpy( x, x, p, alpha, start, end, l ); + vector_PRECISION_saxpy( &x, &x, &p, alpha, start, end, l ); break; } - apply_operator_PRECISION( t, s, ps, l, threading ); - omega = global_inner_product_PRECISION( t, s, ps->v_start, ps->v_end, l, threading ) - / global_inner_product_PRECISION( t, t, ps->v_start, ps->v_end, l, threading ); + apply_operator_PRECISION( &t, &s, ps, l, threading ); + omega = global_inner_product_PRECISION( &t, &s, ps->v_start, ps->v_end, l, threading ) + / global_inner_product_PRECISION( &t, &t, ps->v_start, ps->v_end, l, threading ); - vector_PRECISION_saxpy( x, x, p, alpha, start, end, l ); - vector_PRECISION_saxpy( x, x, s, omega, start, end, l ); - vector_PRECISION_saxpy( r, s, t, -omega, start, end, l ); + vector_PRECISION_saxpy( &x, &x, &p, alpha, start, end, l ); + vector_PRECISION_saxpy( &x, &x, &s, omega, start, end, l ); + vector_PRECISION_saxpy( &r, &s, &t, -omega, start, end, l ); - r_norm = global_norm_PRECISION( r, ps->v_start, ps->v_end, l, 
threading ); + r_norm = global_norm_PRECISION( &r, ps->v_start, ps->v_end, l, threading ); #if defined(TRACK_RES) && !defined(WILSON_BENCHMARK) START_MASTER(threading) @@ -537,16 +572,15 @@ void cgn_PRECISION( gmres_PRECISION_struct *ps, level_struct *l, struct Thread * // this puts zero for all other hyperthreads, so we can call functions below with all hyperthreads compute_core_start_end(ps->v_start, ps->v_end, &start, &end, l, threading); - vector_PRECISION_define( x, 0, start, end, l ); - apply_operator_PRECISION( Dp, x, ps, l, threading ); - vector_PRECISION_minus( pp, b, Dp, start, end, l ); - apply_operator_dagger_PRECISION( r_old, pp, ps, l, threading ); + vector_PRECISION_define( &x, 0, start, end, l ); + apply_operator_PRECISION( &Dp, &x, ps, l, threading ); + vector_PRECISION_minus( &pp, &b, &Dp, start, end, l ); + apply_operator_dagger_PRECISION( &r_old, &pp, ps, l, threading ); - vector_PRECISION_copy( p, r_old, start, end, l ); - r0_norm = global_norm_PRECISION( r_old, ps->v_start, ps->v_end, l, threading ); + vector_PRECISION_copy( &p, &r_old, start, end, l ); + r0_norm = global_norm_PRECISION( &r_old, ps->v_start, ps->v_end, l, threading ); // prod_rr_old = global_inner_product_PRECISION( r_old, r_old, ps->v_start, ps->v_end, l, threading ); prod_rr_old = r0_norm*r0_norm; - #if defined(TRACK_RES) && !defined(WILSON_BENCHMARK) if ( ps->print ) { START_MASTER(threading) @@ -557,19 +591,19 @@ void cgn_PRECISION( gmres_PRECISION_struct *ps, level_struct *l, struct Thread * while ( sqrt(prod_rr_old) / r0_norm > tol && iter < maxiter ) { iter++; - apply_operator_PRECISION( pp, p, ps, l, threading ); - apply_operator_dagger_PRECISION( Dp, pp, ps, l, threading ); + apply_operator_PRECISION( &pp, &p, ps, l, threading ); + apply_operator_dagger_PRECISION( &Dp, &pp, ps, l, threading ); - gamma = global_inner_product_PRECISION( p, Dp, ps->v_start, ps->v_end, l, threading ); + gamma = global_inner_product_PRECISION( &p, &Dp, ps->v_start, ps->v_end, l, threading ); alpha = prod_rr_old / gamma; - vector_PRECISION_saxpy( x, x, p, alpha, start, end, l ); - vector_PRECISION_saxpy( r_new, r_old, Dp, -alpha, start, end, l ); + vector_PRECISION_saxpy( &x, &x, &p, alpha, start, end, l ); + vector_PRECISION_saxpy( &r_new, &r_old, &Dp, -alpha, start, end, l ); - gamma = global_inner_product_PRECISION( r_new, r_new, ps->v_start, ps->v_end, l, threading ); + gamma = global_inner_product_PRECISION( &r_new, &r_new, ps->v_start, ps->v_end, l, threading ); beta = gamma / prod_rr_old; - vector_PRECISION_saxpy( p, r_new, p, beta, start, end, l ); - vector_PRECISION_copy( r_old, r_new, start, end, l ); + vector_PRECISION_saxpy( &p, &r_new, &p, beta, start, end, l ); + vector_PRECISION_copy( &r_old, &r_new, start, end, l ); prod_rr_old = gamma; #if defined(TRACK_RES) && !defined(WILSON_BENCHMARK) if ( iter%100 == 0 && ps->print >=1 ) { @@ -580,10 +614,10 @@ void cgn_PRECISION( gmres_PRECISION_struct *ps, level_struct *l, struct Thread * #endif } - r0_norm = global_norm_PRECISION( b, ps->v_start, ps->v_end, l, threading ); - apply_operator_PRECISION( Dp, x, ps, l, threading ); - vector_PRECISION_minus( r_true, b, Dp, start, end, l ); - r_norm = global_norm_PRECISION( r_true, ps->v_start, ps->v_end, l, threading ); + r0_norm = global_norm_PRECISION( &b, ps->v_start, ps->v_end, l, threading ); + apply_operator_PRECISION( &Dp, &x, ps, l, threading ); + vector_PRECISION_minus( &r_true, &b, &Dp, start, end, l ); + r_norm = global_norm_PRECISION( &r_true, ps->v_start, ps->v_end, l, threading ); #if 
defined(TRACK_RES) && !defined(WILSON_BENCHMARK) if ( ps->print ) { @@ -598,22 +632,22 @@ void cgn_PRECISION( gmres_PRECISION_struct *ps, level_struct *l, struct Thread * while ( r_norm / r0_norm > tol && iter < maxiter ) { iter++; - apply_operator_PRECISION( pp, p, ps, l, threading ); - apply_operator_dagger_PRECISION( Dp, pp, ps, l, threading ); + apply_operator_PRECISION( &pp, &p, ps, l, threading ); + apply_operator_dagger_PRECISION( &Dp, &pp, ps, l, threading ); - gamma = global_inner_product_PRECISION( p, Dp, ps->v_start, ps->v_end, l, threading ); + gamma = global_inner_product_PRECISION( &p, &Dp, ps->v_start, ps->v_end, l, threading ); alpha = prod_rr_old / gamma; - vector_PRECISION_saxpy( x, x, p, alpha, start, end, l ); - vector_PRECISION_saxpy( r_new, r_old, Dp, -alpha, start, end, l ); + vector_PRECISION_saxpy( &x, &x, &p, alpha, start, end, l ); + vector_PRECISION_saxpy( &r_new, &r_old, &Dp, -alpha, start, end, l ); // residual update - vector_PRECISION_saxpy( r_true, r_true, pp, -alpha, start, end, l ); - r_norm = global_norm_PRECISION( r_true, ps->v_start, ps->v_end, l, threading ); - gamma = global_inner_product_PRECISION( r_new, r_new, ps->v_start, ps->v_end, l, threading ); + vector_PRECISION_saxpy( &r_true, &r_true, &pp, -alpha, start, end, l ); + r_norm = global_norm_PRECISION( &r_true, ps->v_start, ps->v_end, l, threading ); + gamma = global_inner_product_PRECISION( &r_new, &r_new, ps->v_start, ps->v_end, l, threading ); beta = gamma / prod_rr_old; - vector_PRECISION_saxpy( p, r_new, p, beta, start, end, l ); - vector_PRECISION_copy( r_old, r_new, start, end, l ); + vector_PRECISION_saxpy( &p, &r_new, &p, beta, start, end, l ); + vector_PRECISION_copy( &r_old, &r_new, start, end, l ); prod_rr_old = gamma; #if defined(TRACK_RES) && !defined(WILSON_BENCHMARK) if ( iter%100 == 0 && ps->print >=1 ) { @@ -630,10 +664,10 @@ void cgn_PRECISION( gmres_PRECISION_struct *ps, level_struct *l, struct Thread * printf0("+----------------------------------------------------------+\n"); printf0("| CGN iterations: %-6d |\n", iter ); END_MASTER(threading) - apply_operator_PRECISION( Dp, x, ps, l, threading ); - vector_PRECISION_minus( pp, b, Dp, start, end, l ); + apply_operator_PRECISION( &Dp, &x, ps, l, threading ); + vector_PRECISION_minus( &pp, &b, &Dp, start, end, l ); - beta = global_norm_PRECISION( pp, ps->v_start, ps->v_end, l, threading ); + beta = global_norm_PRECISION( &pp, ps->v_start, ps->v_end, l, threading ); START_MASTER(threading) if ( ps->timing ) printf0("| exact relative residual: ||r||/||b|| = %e |\n", creal(beta/r0_norm) ); printf0("| elapsed wall clock time: %-12g seconds |\n", t1-t0 ); @@ -658,15 +692,15 @@ void cgn_PRECISION( gmres_PRECISION_struct *ps, level_struct *l, struct Thread * } -int arnoldi_step_PRECISION( vector_PRECISION *V, vector_PRECISION *Z, vector_PRECISION w, +int arnoldi_step_PRECISION( vector_PRECISION *V, vector_PRECISION *Z, vector_PRECISION *w, complex_PRECISION **H, complex_PRECISION* buffer, int j, void (*prec)(), gmres_PRECISION_struct *p, level_struct *l, struct Thread *threading ) { /********************************************************************************* * Extends the Arnoldi basis by one vector. -* - vector_PRECISION *V: Contains the Arnoldi basis vectors. -* - vector_PRECISION *Z: If a right precond. P is used, contains P*V[j] for all j. -* - vector_PRECISION w: Will be appended to existing Arnoldi basis at +* - vector_PRECISION **V: Contains the Arnoldi basis vectors. +* - vector_PRECISION **Z: If a right precond. 
P is used, contains P*V[j] for all j. +* - vector_PRECISION *w: Will be appended to existing Arnoldi basis at * position j+1. * - complex_PRECISION **H: Contains full Hessenberg matrix from the Arnoldi * decomposition (columnmajor!) @@ -688,12 +722,12 @@ int arnoldi_step_PRECISION( vector_PRECISION *V, vector_PRECISION *Z, vector_PRE compute_core_start_end(p->v_start, p->v_end, &start, &end, l, threading); if ( j == 0 ) - vector_PRECISION_copy( Z[0], V[0], start, end, l ); + vector_PRECISION_copy( &Z[0], &V[0], start, end, l ); else - vector_PRECISION_copy( V[j], Z[j], start, end, l ); + vector_PRECISION_copy( &V[j], &Z[j], start, end, l ); complex_PRECISION tmp[j+1]; - process_multi_inner_product_PRECISION( j+1, tmp, V, V[j], p->v_start, p->v_end, l, threading ); + process_multi_inner_product_PRECISION( j+1, tmp, V, &V[j], p->v_start, p->v_end, l, threading ); START_MASTER(threading) PROF_PRECISION_START( _ALLR ); for( i=0; i<=j; i++ ) @@ -708,7 +742,7 @@ int arnoldi_step_PRECISION( vector_PRECISION *V, vector_PRECISION *Z, vector_PRE PROF_PRECISION_STOP( _ALLR, 1 ); END_MASTER(threading) - apply_operator_PRECISION( Z[j+1], Z[j], p, l, threading ); + apply_operator_PRECISION( &Z[j+1], &Z[j], p, l, threading ); START_MASTER(threading) PROF_PRECISION_START( _ALLR ); @@ -725,8 +759,8 @@ int arnoldi_step_PRECISION( vector_PRECISION *V, vector_PRECISION *Z, vector_PRE SYNC_MASTER_TO_ALL(threading) for( i=0; i 0 ) { @@ -736,13 +770,13 @@ int arnoldi_step_PRECISION( vector_PRECISION *V, vector_PRECISION *Z, vector_PRE SYNC_MASTER_TO_ALL(threading) if ( j == 0 ) { - if ( sigma ) vector_PRECISION_saxpy( Z[j+1], Z[j+1], Z[j], -sigma, start, end, l ); + if ( sigma ) vector_PRECISION_saxpy( &Z[j+1], &Z[j+1], &Z[j], -sigma, start, end, l ); } else { for( i=0; ikind == _LEFT ) { - apply_operator_PRECISION( Z[0], V[j], p, l, threading ); - prec( V[j+1], NULL, Z[0], _NO_RES, l, threading ); - if ( sigma ) vector_PRECISION_saxpy( V[j+1], V[j+1], V[j], -sigma, start, end, l ); + apply_operator_PRECISION( &Z[0], &V[j], p, l, threading ); + prec( &V[j+1], NULL, &Z[0], _NO_RES, l, threading ); + if ( sigma ) vector_PRECISION_saxpy( &V[j+1], &V[j+1], &V[j], -sigma, start, end, l ); } else { if ( l->level == 0 ) { - prec( Z[j], NULL, V[j], _NO_RES, l, threading ); - apply_operator_PRECISION( V[j+1], Z[j], p, l, threading ); + prec( &Z[j], NULL, &V[j], _NO_RES, l, threading ); + apply_operator_PRECISION( &V[j+1], &Z[j], p, l, threading ); } else { if ( g.mixed_precision == 2 && (g.method >= 1 && g.method <= 2 ) ) { - prec( Z[j], V[j+1], V[j], _NO_RES, l, threading ); + prec( &Z[j], &V[j+1], &V[j], _NO_RES, l, threading ); // obtains w = D * Z[j] from Schwarz } else { - prec( Z[j], NULL, V[j], _NO_RES, l, threading ); - apply_operator_PRECISION( V[j+1], Z[j], p, l, threading ); // w = D*Z[j] + prec( &Z[j], NULL, &V[j], _NO_RES, l, threading ); + apply_operator_PRECISION( &V[j+1], &Z[j], p, l, threading ); // w = D*Z[j] } } - if ( sigma ) vector_PRECISION_saxpy( V[j+1], V[j+1], V[j], -sigma, start, end, l ); + if ( sigma ) vector_PRECISION_saxpy( &V[j+1], &V[j+1], &V[j], -sigma, start, end, l ); } } else { - apply_operator_PRECISION( V[j+1], V[j], p, l, threading ); // w = D*V[j] - if ( sigma ) vector_PRECISION_saxpy( V[j+1], V[j+1], V[j], -sigma, start, end, l ); + apply_operator_PRECISION( &V[j+1], &V[j], p, l, threading ); // w = D*V[j] + if ( sigma ) vector_PRECISION_saxpy( &V[j+1], &V[j+1], &V[j], -sigma, start, end, l ); } complex_PRECISION tmp[j+2]; - process_multi_inner_product_PRECISION( j+2, tmp, V, 
V[j+1], p->v_start, p->v_end, l, threading ); + process_multi_inner_product_PRECISION( j+2, tmp, V, &V[j+1], p->v_start, p->v_end, l, threading ); START_MASTER(threading) for( i=0; i<=j+1; i++ ) buffer[i] = tmp[i]; @@ -804,8 +838,8 @@ int arnoldi_step_PRECISION( vector_PRECISION *V, vector_PRECISION *Z, vector_PRE SYNC_MASTER_TO_ALL(threading) for( i=0; i<=j; i++ ) - vector_PRECISION_saxpy( V[j+1], V[j+1], V[i], -H[j][i], start, end, l ); - vector_PRECISION_real_scale( V[j+1], V[j+1], 1/H[j][j+1], start, end, l ); + vector_PRECISION_saxpy( &V[j+1], &V[j+1], &V[i], -H[j][i], start, end, l ); + vector_PRECISION_real_scale( &V[j+1], &V[j+1], 1/H[j][j+1], start, end, l ); START_LOCKED_MASTER(threading) H[j][j] += sigma; END_LOCKED_MASTER(threading) @@ -824,23 +858,23 @@ int arnoldi_step_PRECISION( vector_PRECISION *V, vector_PRECISION *Z, vector_PRE if ( prec != NULL ) { if ( p->kind == _LEFT ) { - apply_operator_PRECISION( Z[0], V[j], p, l, threading ); - prec( w, NULL, Z[0], _NO_RES, l, threading ); + apply_operator_PRECISION( &Z[0], &V[j], p, l, threading ); + prec( w, NULL, &Z[0], _NO_RES, l, threading ); } else { if ( l->level == 0 ) { - apply_operator_PRECISION( w, Z[j], p, l, threading ); + apply_operator_PRECISION( w, &Z[j], p, l, threading ); } else { if ( g.mixed_precision == 2 && (g.method >= 1 && g.method <= 2 ) ) { - prec( Z[j], w, V[j], _NO_RES, l, threading ); + prec( &Z[j], w, &V[j], _NO_RES, l, threading ); // obtains w = D * Z[j] from Schwarz } else { - prec( Z[j], NULL, V[j], _NO_RES, l, threading ); - apply_operator_PRECISION( w, Z[j], p, l, threading ); // w = D*Z[j] + prec( &Z[j], NULL, &V[j], _NO_RES, l, threading ); + apply_operator_PRECISION( w, &Z[j], p, l, threading ); // w = D*Z[j] } } } } else { - apply_operator_PRECISION( w, V[j], p, l, threading ); // w = D*V[j] + apply_operator_PRECISION( w, &V[j], p, l, threading ); // w = D*V[j] } // orthogonalization @@ -860,7 +894,7 @@ int arnoldi_step_PRECISION( vector_PRECISION *V, vector_PRECISION *Z, vector_PRE END_MASTER(threading) SYNC_MASTER_TO_ALL(threading) for( i=0; i<=j; i++ ) - vector_PRECISION_saxpy( w, w, V[i], -H[j][i], start, end, l ); + vector_PRECISION_saxpy( w, w, &V[i], -H[j][i], start, end, l ); #ifdef REORTH // re-orthogonalization process_multi_inner_product_PRECISION( j+1, tmp, V, w, p->v_start, p->v_end, l, threading ); @@ -879,7 +913,7 @@ int arnoldi_step_PRECISION( vector_PRECISION *V, vector_PRECISION *Z, vector_PRE END_MASTER(threading) SYNC_MASTER_TO_ALL(threading) for( i=0; i<=j; i++ ) - vector_PRECISION_saxpy( w, w, V[i], -tmp[i], start, end, l ); + vector_PRECISION_saxpy( w, w, &V[i], -tmp[i], start, end, l ); #endif // normalization @@ -891,7 +925,256 @@ int arnoldi_step_PRECISION( vector_PRECISION *V, vector_PRECISION *Z, vector_PRE // V_j+1 = w / H_j+1,j if ( cabs_PRECISION( H[j][j+1] ) > 1e-15 ) - vector_PRECISION_real_scale( V[j+1], w, 1/H[j][j+1], start, end, l ); + vector_PRECISION_real_scale( &V[j+1], w, 1/H[j][j+1], start, end, l ); +#endif + return 1; +} + + + +int arnoldi_step_PRECISION_new( vector_PRECISION *V, vector_PRECISION *Z, vector_PRECISION *w, + complex_PRECISION **H, complex_PRECISION* buffer, int j, void (*prec)(), + gmres_PRECISION_struct *p, level_struct *l, struct Thread *threading ) { + +/********************************************************************************* +* Extends the Arnoldi basis by one vector. +* - vector_PRECISION **V: Contains the Arnoldi basis vectors. +* - vector_PRECISION **Z: If a right precond. P is used, contains P*V[j] for all j. 
+* - vector_PRECISION *w: Will be appended to existing Arnoldi basis at +* position j+1. +* - complex_PRECISION **H: Contains full Hessenberg matrix from the Arnoldi +* decomposition (columnmajor!) +* - complex_PRECISION* buffer: Buffer for local inner products. +* - int j: index of the new Arnoldi vector to be orthonormalized +* against all previous ones. +* - void (*prec)(): Function pointer to preconditioner (can be NULL if no +* preconditioning is used). +*********************************************************************************/ +#ifdef SINGLE_ALLREDUCE_ARNOLDI +#ifdef PIPELINED_ARNOLDI + if ( l->level == 0 && l->depth > 0 ) { + SYNC_MASTER_TO_ALL(threading) + SYNC_CORES(threading) + MPI_Request req; + MPI_Status stat; + int start, end, i; + const complex_PRECISION sigma = 0; + compute_core_start_end(p->v_start, p->v_end, &start, &end, l, threading); + + if ( j == 0 ) + vector_PRECISION_copy( &Z[0], &V[0], start, end, l ); + else + vector_PRECISION_copy( &V[j], &Z[j], start, end, l ); + + complex_PRECISION tmp[j+1]; + process_multi_inner_product_PRECISION( j+1, tmp, V, &V[j], p->v_start, p->v_end, l, threading ); + START_MASTER(threading) + PROF_PRECISION_START( _ALLR ); + for( i=0; i<=j; i++ ) + buffer[i] = tmp[i]; + if ( g.num_processes > 1 ) { + MPI_Iallreduce( buffer, H[MAX(0,j-1)], j+1, MPI_COMPLEX_PRECISION, MPI_SUM, + (l->depth==0)?g.comm_cart:l->gs_PRECISION.level_comm, &req ); + } else { + for( i=0; i<=j; i++ ) + H[MAX(0,j-1)][i] = buffer[i]; + } + PROF_PRECISION_STOP( _ALLR, 1 ); + END_MASTER(threading) + + apply_operator_PRECISION( &Z[j+1], &Z[j], p, l, threading ); + + START_MASTER(threading) + PROF_PRECISION_START( _ALLR ); + if ( g.num_processes > 1 ) { + MPI_Wait( &req, &stat ); + } + PROF_PRECISION_STOP( _ALLR, 0 ); + if ( j > 0 ) { + for ( i=0; i 0 ) { + H[j-1][j-1] += sigma; + } + END_MASTER(threading) + SYNC_MASTER_TO_ALL(threading) + + if ( j == 0 ) { + if ( sigma ) vector_PRECISION_saxpy( &Z[j+1], &Z[j+1], &Z[j], -sigma, start, end, l ); + } else { + for( i=0; iv_start, p->v_end, &start, &end, l, threading); + + if ( prec != NULL ) { + if ( p->kind == _LEFT ) { + apply_operator_PRECISION( &Z[0], &V[j], p, l, threading ); + prec( &V[j+1], NULL, &Z[0], _NO_RES, l, threading ); + if ( sigma ) vector_PRECISION_saxpy( &V[j+1], &V[j+1], &V[j], -sigma, start, end, l ); + } else { + if ( l->level == 0 ) { + prec( &Z[j], NULL, &V[j], _NO_RES, l, threading ); + apply_operator_PRECISION( &V[j+1], &Z[j], p, l, threading ); + } else { + if ( g.mixed_precision == 2 && (g.method >= 1 && g.method <= 2 ) ) { + prec( &Z[j], &V[j+1], &V[j], _NO_RES, l, threading ); + // obtains w = D * Z[j] from Schwarz + } else { + prec( &Z[j], NULL, &V[j], _NO_RES, l, threading ); + apply_operator_PRECISION( &V[j+1], &Z[j], p, l, threading ); // w = D*Z[j] + } + } + if ( sigma ) vector_PRECISION_saxpy( &V[j+1], &V[j+1], &V[j], -sigma, start, end, l ); + + } + } else { + apply_operator_PRECISION( &V[j+1], &V[j], p, l, threading ); // w = D*V[j] + if ( sigma ) vector_PRECISION_saxpy( &V[j+1], &V[j+1], &V[j], -sigma, start, end, l ); + } + + complex_PRECISION tmp[j+2]; + process_multi_inner_product_PRECISION( j+2, tmp, V, &V[j+1], p->v_start, p->v_end, l, threading ); + START_MASTER(threading) + for( i=0; i<=j+1; i++ ) + buffer[i] = tmp[i]; + + if ( g.num_processes > 1 ) { + PROF_PRECISION_START( _ALLR ); + MPI_Allreduce( buffer, H[j], j+2, MPI_COMPLEX_PRECISION, MPI_SUM, (l->depth==0)?g.comm_cart:l->gs_PRECISION.level_comm ); + PROF_PRECISION_STOP( _ALLR, 1 ); + } else { + for( i=0; i<=j+1; 
i++ ) + H[j][i] = buffer[i]; + } + for ( i=0; i<=j; i++ ) + H[j][j+1] -= conj( H[j][i] )*H[j][i]; + END_MASTER(threading) + SYNC_MASTER_TO_ALL(threading) + if ( creal( H[j][j+1] ) < 0 ) + return 0; + START_MASTER(threading) + H[j][j+1] = sqrt( creal( H[j][j+1] ) ); + END_MASTER(threading) + SYNC_MASTER_TO_ALL(threading) + + for( i=0; i<=j; i++ ) + vector_PRECISION_saxpy( &V[j+1], &V[j+1], &V[i], -H[j][i], start, end, l ); + vector_PRECISION_real_scale( &V[j+1], &V[j+1], 1/H[j][j+1], start, end, l ); + START_LOCKED_MASTER(threading) + H[j][j] += sigma; + END_LOCKED_MASTER(threading) +#ifdef PIPELINED_ARNOLDI + } +#endif +#else + SYNC_MASTER_TO_ALL(threading) + SYNC_CORES(threading) + int i, n_vect=g.num_rhs_vect, n, jj; + PRECISION H_tot; + // start and end indices for vector functions depending on thread + int start, end; + // compute start and end indices for core + // this puts zero for all other hyperthreads, so we can call functions below with all hyperthreads + //compute_core_start_end(p->v_start, p->v_end, &start, &end, l, threading); + + if ( prec != NULL ) { + if ( p->kind == _LEFT ) { + apply_operator_PRECISION( &Z[0], &V[j], p, l, threading ); + prec( w, NULL, &Z[0], _NO_RES, l, threading ); + } else { + if ( l->level == 0 ) { + apply_operator_PRECISION( w, &Z[j], p, l, threading ); + } else { + if ( g.mixed_precision == 2 && (g.method >= 1 && g.method <= 2 ) ) { + prec( &Z[j], w, &V[j], _NO_RES, l, threading ); + // obtains w = D * Z[j] from Schwarz + } else { + prec( &Z[j], NULL, &V[j], _NO_RES, l, threading ); + apply_operator_PRECISION( w, &Z[j], p, l, threading ); // w = D*Z[j] + } + } + } + } else { + apply_operator_PRECISION( w, &V[j], p, l, threading ); // w = D*V[j] + } + + // orthogonalization + complex_PRECISION tmp[(j+1)*n_vect]; + process_multi_inner_product_PRECISION_new( j+1, tmp, V, w, l, threading ); + START_MASTER(threading) + for( i=0; i<=j; i++ ) + VECTOR_LOOP(n, n_vect, jj, buffer[i*n_vect+n+jj] = tmp[i*n_vect+n+jj];) + + if ( g.num_processes > 1 ) { + PROF_PRECISION_START( _ALLR ); + MPI_Allreduce( buffer, H[j], (j+1)*n_vect, MPI_COMPLEX_PRECISION, MPI_SUM, (l->depth==0)?g.comm_cart:l->gs_PRECISION.level_comm ); + PROF_PRECISION_STOP( _ALLR, 1 ); + } else { + for( i=0; i<=j; i++ ) + VECTOR_LOOP(n, n_vect, jj, H[j][i*n_vect+n+jj] = buffer[i*n_vect+n+jj];) + } + END_MASTER(threading) + SYNC_MASTER_TO_ALL(threading) + for( i=0; i<=j; i++ ) + vector_PRECISION_saxpy_new( w, w, &V[i], H[j], i, -1, l, threading ); + +#ifdef REORTH + // re-orthogonalization + process_multi_inner_product_PRECISION_new( j+1, tmp, V, w, l, threading ); + START_MASTER(threading) + for( i=0; i<=j; i++ ) + VECTOR_LOOP(n, n_vect, jj, buffer[i*n_vect+n+jj] = tmp[i*n_vect+n+jj];) + + if ( g.num_processes > 1 ) { + PROF_PRECISION_START( _ALLR ); + MPI_Allreduce( buffer, tmp, (j+1)*n_vect, MPI_COMPLEX_PRECISION, MPI_SUM, (l->depth==0)?g.comm_cart:l->gs_PRECISION.level_comm ); + PROF_PRECISION_STOP( _ALLR, 1 ); + } + + for( i=0; i<=j; i++ ) + VECTOR_LOOP(n, n_vect, jj, H[j][i*n_vect+n+jj] += tmp[i*n_vect+n+jj];) + + END_MASTER(threading) + SYNC_MASTER_TO_ALL(threading) + for( i=0; i<=j; i++ ) + vector_PRECISION_saxpy_new( w, w, &V[i], tmp, i, -1, l, threading ); +#endif + + // normalization + PRECISION tmp2[n_vect]; + global_norm_PRECISION_new( tmp2, w, l, threading ); + START_MASTER(threading) + + VECTOR_LOOP(n, n_vect, jj, H[j][(j+1)*n_vect+n+jj] = tmp2[n+jj];) + + END_MASTER(threading) + SYNC_MASTER_TO_ALL(threading) + + // V_j+1 = w / H_j+1,j + H_tot=0; + VECTOR_LOOP(n, n_vect, jj, H_tot += 
cabs_PRECISION( p->H[j][(j+1)*n_vect+n+jj] );) + if ( H_tot > n_vect*1e-15 ) + vector_PRECISION_real_scale_new( &V[j+1], w, H[j], j+1, 1, l, threading ); #endif return 1; } @@ -917,23 +1200,29 @@ void qr_update_PRECISION( complex_PRECISION **H, complex_PRECISION *s, PROF_PRECISION_START( _SMALL1 ); - int i; - complex_PRECISION beta; + int i, n, jj, n_vect=g.num_rhs_vect; + complex_PRECISION beta[n_vect]; // update QR factorization // apply previous Givens rotation - for ( i=0; iv_start+x->size*n_vec, p->v_end+x->size*n_vec, &start, &end, l, threading); + + START_MASTER(threading) + + PROF_PRECISION_START( _SMALL2 ); + + // backward substitution + for ( i=j; i>=0; i-- ) { + VECTOR_LOOP(n, n_vect, jj, y[i*n_vect+n+jj] = gamma[i*n_vect+n+jj];) + for ( k=i+1; k<=j; k++ ) { + for ( n=0; nnum_lattice_site_var, n = l->block_iter, end = (g.odd_even&&l->depth==0)?(start+nv*s->num_block_even_sites):(start+s->block_vector_size); - vector_PRECISION Dr = s->local_minres_buffer[0]; - vector_PRECISION r = s->local_minres_buffer[1]; - vector_PRECISION lphi = s->local_minres_buffer[2]; + vector_PRECISION Dr, r, lphi; + Dr.vector_buffer = s->local_minres_buffer[0]; + r.vector_buffer = s->local_minres_buffer[1]; + lphi.vector_buffer = s->local_minres_buffer[2]; complex_PRECISION alpha; void (*block_op)() = (l->depth==0)?(g.odd_even?apply_block_schur_complement_PRECISION:block_d_plus_clover_PRECISION) :coarse_block_operator_PRECISION; - - vector_PRECISION_copy( r, eta, start, end, l ); - vector_PRECISION_define( lphi, 0, start, end, l ); + + vector_PRECISION_copy( &r, eta, start, end, l ); + vector_PRECISION_define( &lphi, 0, start, end, l ); for ( i=0; i/ - alpha = local_xy_over_xx_PRECISION( Dr, r, start, end, l ); + alpha = local_xy_over_xx_PRECISION( &Dr, &r, start, end, l ); // phi += alpha * r - vector_PRECISION_saxpy( lphi, lphi, r, alpha, start, end, l ); + vector_PRECISION_saxpy( &lphi, &lphi, &r, alpha, start, end, l ); // r -= alpha * Dr - vector_PRECISION_saxpy( r, r, Dr, -alpha, start, end, l ); + vector_PRECISION_saxpy( &r, &r, &Dr, -alpha, start, end, l ); } - if ( latest_iter != NULL ) vector_PRECISION_copy( latest_iter, lphi, start, end, l ); - if ( phi != NULL ) vector_PRECISION_plus( phi, phi, lphi, start, end, l ); - vector_PRECISION_copy( eta, r, start, end, l ); - + if ( latest_iter != NULL ) vector_PRECISION_copy( latest_iter, &lphi, start, end, l ); + if ( phi != NULL ) vector_PRECISION_plus( phi, phi, &lphi, start, end, l ); + vector_PRECISION_copy( eta, &r, start, end, l ); + END_UNTHREADED_FUNCTION(threading) } @@ -1051,36 +1388,36 @@ void fgcr_PRECISION( gmres_PRECISION_struct *p, level_struct *l ) { for( ol=0; olnum_restart && finish==0; ol++ ) { if( ol == 0 && p->initial_guess_zero ) { - vector_PRECISION_copy( p->r, p->b, p->v_start, p->v_end, l ); + vector_PRECISION_copy( &(p->r), &(p->b), p->v_start, p->v_end, l ); } else { - apply_operator_PRECISION( p->w, p->x, p, l, no_threading ); // compute w = D*x - vector_PRECISION_minus( p->r, p->b, p->w, p->v_start, p->v_end, l ); // compute r = b - w + apply_operator_PRECISION( &(p->w), &(p->x), p, l, no_threading ); // compute w = D*x + vector_PRECISION_minus( &(p->r), &(p->b), &(p->w), p->v_start, p->v_end, l ); // compute r = b - w } if( ol == 0) { - r0_norm = global_norm_PRECISION( p->r, p->v_start, p->v_end, l, no_threading ); + r0_norm = global_norm_PRECISION( &(p->r), p->v_start, p->v_end, l, no_threading ); } for( il=0; ilrestart_length && finish==0; il++ ) { j = il; iter++; - p->preconditioner( p->V[j], p->r, _NO_RES, l, 
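To make the small-system part of this change concrete: qr_update_PRECISION now keeps one beta per right-hand side and applies each Givens rotation entry-wise across the block, and compute_solution_PRECISION_new (like compute_solution_MP_new above) repeats the usual upper-triangular solve once per right-hand side in the H[col][row*n_vect+rhs] layout. A standalone sketch of that back substitution, with plain loops in place of VECTOR_LOOP and hypothetical argument names:

#include <complex.h>

/* y and gamma are flat arrays of (j+1)*n_vect entries; H[k][i*n_vect+n] is
 * the (row i, column k) Hessenberg entry for right-hand side n. */
static void back_subst_block( int j, int n_vect, double complex **H,
                              const double complex *gamma, double complex *y ) {
  for ( int i=j; i>=0; i-- )
    for ( int n=0; n<n_vect; n++ ) {
      double complex s = gamma[i*n_vect+n];
      for ( int k=i+1; k<=j; k++ )
        s -= H[k][i*n_vect+n] * y[k*n_vect+n];   /* subtract already-solved components */
      y[i*n_vect+n] = s / H[i][i*n_vect+n];      /* divide by the diagonal entry */
    }
}

The coefficients y then multiply the stored V (or Z) vectors exactly as before, only through the *_new scale/saxpy wrappers that iterate over the block internally.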
no_threading ); - apply_operator_PRECISION( p->Z[j], p->V[j], p, l, no_threading ); + p->preconditioner( &(p->V[j]), &(p->r), _NO_RES, l, no_threading ); + apply_operator_PRECISION( &(p->Z[j]), &(p->V[j]), p, l, no_threading ); for( i=0; iZ[i], p->Z[j], p->v_start, p->v_end, l, no_threading ) / p->gamma[i]; - vector_PRECISION_saxpy( p->V[j], p->V[j], p->V[i], -beta, p->v_start, p->v_end, l ); - vector_PRECISION_saxpy( p->Z[j], p->Z[j], p->Z[i], -beta, p->v_start, p->v_end, l ); + beta = global_inner_product_PRECISION( &(p->Z[i]), &(p->Z[j]), p->v_start, p->v_end, l, no_threading ) / p->gamma[i]; + vector_PRECISION_saxpy( &(p->V[j]), &(p->V[j]), &(p->V[i]), -beta, p->v_start, p->v_end, l ); + vector_PRECISION_saxpy( &(p->Z[j]), &(p->Z[j]), &(p->Z[i]), -beta, p->v_start, p->v_end, l ); } - p->gamma[j] = global_inner_product_PRECISION( p->Z[j], p->Z[j], p->v_start, p->v_end, l, no_threading ); - alpha = global_inner_product_PRECISION( p->Z[j], p->r, p->v_start, p->v_end, l, no_threading ) / p->gamma[j]; - vector_PRECISION_saxpy( p->x, p->x, p->V[j], alpha, p->v_start, p->v_end, l ); - vector_PRECISION_saxpy( p->r, p->r, p->Z[j], -alpha, p->v_start, p->v_end, l ); + p->gamma[j] = global_inner_product_PRECISION( &(p->Z[j]), &(p->Z[j]), p->v_start, p->v_end, l, no_threading ); + alpha = global_inner_product_PRECISION( &(p->Z[j]), &(p->r), p->v_start, p->v_end, l, no_threading ) / p->gamma[j]; + vector_PRECISION_saxpy( &(p->x), &(p->x), &(p->V[j]), alpha, p->v_start, p->v_end, l ); + vector_PRECISION_saxpy( &(p->r), &(p->r), &(p->Z[j]), -alpha, p->v_start, p->v_end, l ); - alpha = global_norm_PRECISION( p->r, p->v_start, p->v_end, l, no_threading ) / r0_norm; + alpha = global_norm_PRECISION( &(p->r), p->v_start, p->v_end, l, no_threading ) / r0_norm; if ( creal(alpha) < p->tol ) { finish = 1; break; @@ -1094,9 +1431,9 @@ void fgcr_PRECISION( gmres_PRECISION_struct *p, level_struct *l ) { if ( p->timing || p->print ) t1 = MPI_Wtime(); if ( p->print ) { - apply_operator_PRECISION( p->w, p->x, p, l, no_threading ); - vector_PRECISION_minus( p->r, p->b, p->w, p->v_start, p->v_end, l ); - beta = global_norm_PRECISION( p->r, p->v_start, p->v_end, l, no_threading ); + apply_operator_PRECISION( &(p->w), &(p->x), p, l, no_threading ); + vector_PRECISION_minus( &(p->r), &(p->b), &(p->w), p->v_start, p->v_end, l ); + beta = global_norm_PRECISION( &(p->r), p->v_start, p->v_end, l, no_threading ); #if defined(TRACK_RES) && !defined(WILSON_BENCHMARK) printf0("+----------------------------------------------------------+\n"); printf0("\n"); diff --git a/src/linsolve_generic.h b/src/linsolve_generic.h index 8a1f2e8..c44cbb5 100644 --- a/src/linsolve_generic.h +++ b/src/linsolve_generic.h @@ -25,23 +25,27 @@ struct Thread; void fgmres_PRECISION_struct_init( gmres_PRECISION_struct *p ); - void fgmres_PRECISION_struct_alloc( int m, int n, long int vl, PRECISION tol, const int type, const int prec_kind, - void (*precond)(), void (*eval_op)(), gmres_PRECISION_struct *p, level_struct* l ); + void fgmres_PRECISION_struct_alloc( int m, int n, const int vl_type, PRECISION tol, const int type, const int prec_kind, + void (*precond)(), void (*eval_op)(), gmres_PRECISION_struct *p, level_struct *l ); void fgmres_PRECISION_struct_free( gmres_PRECISION_struct *p, level_struct *l ); int fgmres_PRECISION( gmres_PRECISION_struct *p, level_struct *l, struct Thread *threading ); void fgcr_PRECISION( gmres_PRECISION_struct *p, level_struct *l ); void cgn_PRECISION( gmres_PRECISION_struct *p, level_struct *l, struct Thread *threading 
); void bicgstab_PRECISION( gmres_PRECISION_struct *ps, level_struct *l, struct Thread *threading ); - void local_minres_PRECISION( vector_PRECISION phi, vector_PRECISION eta, vector_PRECISION latest_iter, + void local_minres_PRECISION( vector_PRECISION *phi, vector_PRECISION *eta, vector_PRECISION *latest_iter, int start, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ); - int arnoldi_step_PRECISION( vector_PRECISION *V, vector_PRECISION *Z, vector_PRECISION w, + int arnoldi_step_PRECISION( vector_PRECISION *V, vector_PRECISION *Z, vector_PRECISION *w, + complex_PRECISION **H, complex_PRECISION* buffer, int j, void (*prec)(), + gmres_PRECISION_struct *p, level_struct *l, struct Thread *threading ); + int arnoldi_step_PRECISION_new( vector_PRECISION *V, vector_PRECISION *Z, vector_PRECISION *w, complex_PRECISION **H, complex_PRECISION* buffer, int j, void (*prec)(), gmres_PRECISION_struct *p, level_struct *l, struct Thread *threading ); void qr_update_PRECISION( complex_PRECISION **H, complex_PRECISION *s, complex_PRECISION *c, complex_PRECISION *gamma, int j, level_struct *l, struct Thread *threading ); - void compute_solution_PRECISION( vector_PRECISION x, vector_PRECISION *V, complex_PRECISION *y, complex_PRECISION *gamma, + void compute_solution_PRECISION( vector_PRECISION *x, vector_PRECISION *V, complex_PRECISION *y, complex_PRECISION *gamma, + complex_PRECISION **H, int j, int ol, gmres_PRECISION_struct *p, level_struct *l, struct Thread *threading ); + void compute_solution_PRECISION_new( vector_PRECISION *x, vector_PRECISION *V, complex_PRECISION *y, complex_PRECISION *gamma, complex_PRECISION **H, int j, int ol, gmres_PRECISION_struct *p, level_struct *l, struct Thread *threading ); - #endif diff --git a/src/main.c b/src/main.c index ef2c3cb..7e67545 100644 --- a/src/main.c +++ b/src/main.c @@ -89,7 +89,7 @@ int main( int argc, char **argv ) { solve_driver( &l, &threading ); } - + printf0("Number of rhs vectors = %d\n", g.num_rhs_vect); finalize_common_thread_data(commonthreaddata); finalize_no_threading(no_threading); free(commonthreaddata); diff --git a/src/main.h b/src/main.h index cf15fde..ebc9e51 100644 --- a/src/main.h +++ b/src/main.h @@ -32,6 +32,10 @@ #ifndef MAIN_HEADER #define MAIN_HEADER + #define num_loop 4 + + #define VECTOR_LOOP(j, jmax, jj, instructions) for( j=0; j 0 ) { variable = (kind*) memalign( 64, sizeof(kind) * (length) ); } \ - if ( variable == NULL && (length) > 0 ) { \ - error0("malloc of \"%s\" failed: no memory allocated (%s:%d), current memory used: %lf GB.\n", \ - #variable, __FILE__, __LINE__, g.cur_storage/1024.0 ); } \ - g.cur_storage += (sizeof(kind) * (length))/(1024.0*1024.0); \ - if ( g.cur_storage > g.max_storage ) g.max_storage = g.cur_storage; }while(0) -#else #define MALLOC( variable, kind, length ) do{ if ( variable != NULL ) { \ printf0("malloc of \"%s\" failed: pointer is not NULL (%s:%d).\n", #variable, __FILE__, __LINE__ ); } \ if ( (length) > 0 ) { variable = (kind*) malloc( sizeof(kind) * (length) ); } \ @@ -102,7 +96,6 @@ #variable, __FILE__, __LINE__, g.cur_storage/1024.0 ); } \ g.cur_storage += (sizeof(kind) * (length))/(1024.0*1024.0); \ if ( g.cur_storage > g.max_storage ) g.max_storage = g.cur_storage; }while(0) -#endif #define FREE( variable, kind, length ) do{ if ( variable != NULL ) { \ free( variable ); variable = NULL; g.cur_storage -= (sizeof(kind) * (length))/(1024.0*1024.0); } else { \ @@ -180,6 +173,9 @@ #else #define DEBUGOUTPUT( A, FORMAT ) #endif + + #define INDEX_NV_LV_SV( NV, NUM_NV, LV, 
NUM_LV, SV, NUM_SV ) SV+NUM_SV*LV+NUM_SV*NUM_LV*NV + #define INDEX_LV_SV_NV( NV, NUM_NV, LV, NUM_LV, SV, NUM_SV ) NV+NUM_NV*SV+NUM_NV*NUM_SV*LV #include "vectorization_control.h" #include "threading.h" @@ -189,7 +185,7 @@ enum { _NO_DEFAULT_SET, _DEFAULT_SET }; enum { _NO_REORDERING, _REORDER }; enum { _ADD, _COPY }; - enum { _ORDINARY, _SCHWARZ, _ODDEVEN }; + enum { _ORDINARY, _SCHWARZ, _ODDEVEN, _INNER }; enum { _RES, _NO_RES }; enum { _STANDARD, _LIME }; //formats enum { _READ, _WRITE }; @@ -201,10 +197,11 @@ enum { _LEFT, _RIGHT, _NOTHING }; enum { _PERIODIC, _ANTIPERIODIC, _TWISTED, _DIRICHLET }; enum { _GIP, _PIP, _LA2, _LA6, _LA8, _LA, _CPY, _SET, _PR, _SC, _NC, _SM, _OP_COMM, _OP_IDLE, _ALLR, _GD_COMM, _GD_IDLE, _GRAM_SCHMIDT, _GRAM_SCHMIDT_ON_AGGREGATES, - _SM1, _SM2, _SM3, _SM4, _SMALL1, _SMALL2, _NUM_PROF }; // _NUM_PROF has always to be the last constant! + _SM1, _SM2, _SM3, _SM4, _SMALL1, _SMALL2, _RS, _NUM_PROF }; // _NUM_PROF has always to be the last constant! enum { _VTS = 20 }; enum { _TRCKD_VAL, _STP_TIME, _SLV_ITER, _SLV_TIME, _CRS_ITER, _CRS_TIME, _SLV_ERR, _CGNR_ERR, _NUM_OPTB }; - + enum { _NV_LV_SV, _LV_SV_NV }; //vector layout + typedef struct block_struct { int start, color, no_comm, *bt; } block_struct; @@ -392,6 +389,10 @@ // bc: 0 dirichlet, 1 periodic, 2 anti-periodic int bc; + // number of rhs vectors (b) to be solved at the same time (hopefully) + int num_rhs_vect; + + complex_double **gamma; var_table vt; @@ -481,24 +482,8 @@ // functions #include "clifford.h" -#ifdef SSE -#include "vectorization_dirac_float.h" -#include "vectorization_dirac_double.h" -#include "blas_vectorized.h" -#include "sse_blas_vectorized.h" -#include "sse_complex_float_intrinsic.h" -#include "sse_complex_double_intrinsic.h" -#include "sse_coarse_operator_float.h" -#include "sse_coarse_operator_double.h" -#include "sse_linalg_float.h" -#include "sse_linalg_double.h" -#include "sse_interpolation_float.h" -#include "sse_interpolation_double.h" -#else -//no intrinsics #include "interpolation_float.h" #include "interpolation_double.h" -#endif #include "data_float.h" #include "data_double.h" @@ -543,6 +528,8 @@ #include "var_table.h" #include "main_post_def_float.h" #include "main_post_def_double.h" +#include "vector_float.h" +#include "vector_double.h" #ifdef HAVE_LIME #include #include diff --git a/src/main_post_def_generic.h b/src/main_post_def_generic.h index 690ef6b..4817c43 100644 --- a/src/main_post_def_generic.h +++ b/src/main_post_def_generic.h @@ -26,35 +26,35 @@ #include "dirac_PRECISION.h" #include "coarse_operator_PRECISION.h" - static inline void apply_operator_PRECISION( vector_PRECISION output, vector_PRECISION input, gmres_PRECISION_struct *p, level_struct *l, struct Thread *threading ) { + static inline void apply_operator_PRECISION( vector_PRECISION *output, vector_PRECISION *input, gmres_PRECISION_struct *p, level_struct *l, struct Thread *threading ) { p->eval_operator( output, input, p->op, l, threading ); } - static inline void apply_operator_dagger_PRECISION( vector_PRECISION output, vector_PRECISION input, gmres_PRECISION_struct *p, level_struct *l, struct Thread *threading ) { + static inline void apply_operator_dagger_PRECISION( vector_PRECISION *output, vector_PRECISION *input, gmres_PRECISION_struct *p, level_struct *l, struct Thread *threading ) { #ifdef HAVE_TM1p1 if( g.n_flavours == 2 ) { - tau1_gamma5_PRECISION( l->vbuf_PRECISION[6], input, l, threading ); + tau1_gamma5_PRECISION( &(l->vbuf_PRECISION[6]), input, l, threading ); } else #endif { - 
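Aside on the new multi-rhs bookkeeping added to main.h above: num_rhs_vect holds the number of right-hand sides to be solved at the same time, the enum pair _NV_LV_SV / _LV_SV_NV names the two possible buffer orderings, and the INDEX_NV_LV_SV / INDEX_LV_SV_NV macros give the flat offset of a (rhs vector, lattice site, site variable) triple in either ordering. A self-contained illustration of the same arithmetic (the sizes below are invented for the example, not taken from the code):

#include <stdio.h>

/* Same index arithmetic as the INDEX_* macros added to main.h:
 *   _NV_LV_SV: site variable fastest, then lattice site, then rhs vector
 *   _LV_SV_NV: rhs vector fastest, then site variable, then lattice site */
#define INDEX_NV_LV_SV(NV, NUM_NV, LV, NUM_LV, SV, NUM_SV) \
  ((SV) + (NUM_SV)*(LV) + (NUM_SV)*(NUM_LV)*(NV))
#define INDEX_LV_SV_NV(NV, NUM_NV, LV, NUM_LV, SV, NUM_SV) \
  ((NV) + (NUM_NV)*(SV) + (NUM_NV)*(NUM_SV)*(LV))

int main(void) {
  int num_nv = 4, num_lv = 8, num_sv = 12;   /* example sizes only */
  int nv = 1, lv = 3, sv = 5;                /* one (vector, site, component) triple */
  printf("_NV_LV_SV offset: %d\n", INDEX_NV_LV_SV(nv, num_nv, lv, num_lv, sv, num_sv));
  printf("_LV_SV_NV offset: %d\n", INDEX_LV_SV_NV(nv, num_nv, lv, num_lv, sv, num_sv));
  return 0;
}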
gamma5_PRECISION( l->vbuf_PRECISION[6], input, l, threading ); + gamma5_PRECISION( &(l->vbuf_PRECISION[6]), input, l, threading ); #ifdef HAVE_TM //TODO: change_mu_sign_PRECISION( p->op, l, threading ); #endif } - apply_operator_PRECISION( l->vbuf_PRECISION[7], l->vbuf_PRECISION[6], p, l, threading ); + apply_operator_PRECISION( &(l->vbuf_PRECISION[7]), &(l->vbuf_PRECISION[6]), p, l, threading ); #ifdef HAVE_TM1p1 if( g.n_flavours == 2 ) { - tau1_gamma5_PRECISION( output, l->vbuf_PRECISION[7], l, threading ); + tau1_gamma5_PRECISION( output,&(l->vbuf_PRECISION[7]), l, threading ); } else #endif { - gamma5_PRECISION( output, l->vbuf_PRECISION[7], l, threading ); + gamma5_PRECISION( output, &(l->vbuf_PRECISION[7]), l, threading ); #ifdef HAVE_TM //TODO: change_mu_sign_PRECISION( p->op, l, threading ); #endif diff --git a/src/main_pre_def_generic.h b/src/main_pre_def_generic.h index d485518..521e5e8 100644 --- a/src/main_pre_def_generic.h +++ b/src/main_pre_def_generic.h @@ -21,17 +21,26 @@ #ifndef MAIN_PRE_DEF_PRECISION_HEADER #define MAIN_PRE_DEF_PRECISION_HEADER - + typedef PRECISION complex complex_PRECISION; typedef PRECISION complex *config_PRECISION; - typedef PRECISION complex *vector_PRECISION; + typedef PRECISION complex *buffer_PRECISION; + + typedef struct { + buffer_PRECISION vector_buffer; + int num_vect; + int layout; + int type; + int size; + struct level_struct *l; + } vector_PRECISION; typedef struct { int length[8], *boundary_table[8], max_length[4], comm_start[8], in_use[8], offset, comm, num_even_boundary_sites[8], num_odd_boundary_sites[8], num_boundary_sites[8]; - vector_PRECISION buffer[8]; + buffer_PRECISION buffer[8]; MPI_Request sreqs[8], rreqs[8]; } comm_PRECISION_struct; @@ -52,12 +61,9 @@ *index_table, *neighbor_table, *translation_table, table_dim[4], *backward_neighbor_table, table_mod_dim[4], *config_boundary_table[4]; - vector_PRECISION *buffer, prnT, prnZ, prnY, prnX, prpT, prpZ, prpY, prpX; + vector_PRECISION *buffer; + buffer_PRECISION prnT, prnZ, prnY, prnX, prpT, prpZ, prpY, prpX; comm_PRECISION_struct c; - OPERATOR_TYPE_PRECISION *D_vectorized; - OPERATOR_TYPE_PRECISION *D_transformed_vectorized; - OPERATOR_TYPE_PRECISION *clover_vectorized; - OPERATOR_TYPE_PRECISION *clover_oo_inv_vectorized; #ifdef HAVE_TM double mu, mu_odd_shift, mu_even_shift; config_PRECISION tm_term; @@ -65,8 +71,6 @@ #ifdef HAVE_TM1p1 double epsbar, epsbar_ig5_odd_shift, epsbar_ig5_even_shift; config_PRECISION epsbar_term, clover_doublet_oo_inv; - OPERATOR_TYPE_PRECISION *clover_doublet_vectorized; - OPERATOR_TYPE_PRECISION *clover_doublet_oo_inv_vectorized; #endif } operator_PRECISION_struct; @@ -87,7 +91,7 @@ operator_PRECISION_struct op; vector_PRECISION buf1, buf2, buf3, buf4, buf5; vector_PRECISION oe_buf[4]; - vector_PRECISION local_minres_buffer[3]; + buffer_PRECISION local_minres_buffer[3]; int block_oe_offset, *index[4], dir_length[4], num_blocks, num_colors, dir_length_even[4], dir_length_odd[4], *oe_index[4], num_block_even_sites, num_block_odd_sites, num_aggregates, diff --git a/src/oddeven_generic.c b/src/oddeven_generic.c index 9da8cce..32d98f4 100644 --- a/src/oddeven_generic.c +++ b/src/oddeven_generic.c @@ -191,250 +191,253 @@ void selfcoupling_LU_doublet_decomposition_PRECISION( const config_PRECISION out #endif -static inline void LLH_perform_fwd_bwd_subs_PRECISION( vector_PRECISION x, vector_PRECISION b, config_PRECISION L ) { +static inline void LLH_perform_fwd_bwd_subs_PRECISION( vector_PRECISION *x, vector_PRECISION *b, config_PRECISION L, + int start, int 
end ) { /********************************************************************************* * Solves L*(L^H)*x = b for x, i.e., the clover coupling for a single lattice * site. -* - vector_PRECISION b: Right hand side. -* - vector_PRECISION x: Solution. +* - vector_PRECISION *b: Right hand side. +* - vector_PRECISION *x: Solution. * - config_PRECISION L: Cholesky factor ( lower triangular matrix ) *********************************************************************************/ - register int i, j; + register int id, i, j; int n; + buffer_PRECISION x_pt = x->vector_buffer, b_pt = b->vector_buffer; + x_pt += start; b_pt += start; - for ( n=0; n<2; n++ ) { - // forward substitution with L - for ( i=0; i<6; i++ ) { - x[i] = b[i]; - for ( j=0; j=0; i-- ) { - for ( j=i+1; j<6; j++ ) { - x[i] = x[i] - conj_PRECISION(L[(j*(j+1))/2 + i]) * x[j]; + L -= 21; + // backward substitution with L^H + for ( i=5; i>=0; i-- ) { + for ( j=i+1; j<6; j++ ) { + x_pt[i] = x_pt[i] - conj_PRECISION(L[(j*(j+1))/2 + i]) * x_pt[j]; + } + x_pt[i] = x_pt[i] / conj_PRECISION(L[(i*(i+1))/2 + i]); } - x[i] = x[i] / conj_PRECISION(L[(i*(i+1))/2 + i]); + x_pt+=6; + b_pt+=6; + L+=21; } - x+=6; - b+=6; - L+=21; + x_pt+=12; b_pt+=12; L+=42; } } -static inline void LU_perform_fwd_bwd_subs_PRECISION( vector_PRECISION x, vector_PRECISION b, config_PRECISION LU ) { + +static inline void LU_perform_fwd_bwd_subs_PRECISION( vector_PRECISION *x, vector_PRECISION *b, config_PRECISION LU, + int start, int end ) { /********************************************************************************* * Solves L*U*x = b for x, i.e., the clover coupling for a single lattice * site. -* - vector_PRECISION b: Right hand side. -* - vector_PRECISION x: Solution. +* - vector_PRECISION *b: Right hand side. +* - vector_PRECISION *x: Solution. * - config_PRECISION L: Lower matrix from modified LU decomposition * Note: U is given by u_{ii}=1, u_{ij}=l_{ji}* / l_{ii} *********************************************************************************/ - register int i, j, n; + register int id, i, j, n; + buffer_PRECISION x_pt = x->vector_buffer, b_pt = b->vector_buffer; + x_pt += start; b_pt += start; #ifdef HAVE_TM1p1 - if( g.n_flavours == 2) - for ( n=0; n<2; n++ ) { - // solve x = U^(-1) L^(-1) b - // forward substitution with L - for ( i=0; i<12; i++ ) { - x[i] = b[i]; - for ( j=0; j=0; i-- ) { - for ( j=i+1; j<12; j++ ) { - x[i] = x[i] - LU[i*12+j]*x[j]; + if( g.n_flavours == 2) { + LU += (start/24)*288; + for ( id=start; id=0; i-- ) { + for ( j=i+1; j<12; j++ ) { + x_pt[i] = x_pt[i] - LU[i*12+j]*x_pt[j]; + } + x_pt[i] = x_pt[i]/LU[i*(12+1)]; } + x_pt+=12; + b_pt+=12; + LU+=12*12; } - // backward substitution with U - for ( i=6-1; i>=0; i-- ) { - for ( j=i+1; j<6; j++ ) { - x[i] = x[i] - LU[i*6+j]*x[j]; + x_pt+=24; b_pt+=24; LU+=288; + } + } else +#endif + { + LU += (start/12)*72; + for ( id=start; id=0; i-- ) { + for ( j=i+1; j<6; j++ ) { + x_pt[i] = x_pt[i] - LU[i*6+j]*x_pt[j]; + } + x_pt[i] = x_pt[i]/LU[i*(6+1)]; + } + x_pt+=6; + b_pt+=6; + LU+=6*6; } - x[i] = x[i]/LU[i*(6+1)]; } - x+=6; - b+=6; - LU+=6*6; + x_pt+=12; b_pt+=12; LU+=72; } } - -static inline void LLH_multiply_PRECISION( vector_PRECISION y, vector_PRECISION x, config_PRECISION L ) { +static inline void LLH_multiply_PRECISION( vector_PRECISION *y, vector_PRECISION *x, config_PRECISION L, + int start, int end ) { /********************************************************************************* * Applies the clover coupling term to a vector, by multiplying L^H * and then L. 
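For orientation, the two clover-block kernels converted in this hunk act on one 6x6 block at a time, with the Cholesky factor L stored as a packed lower triangle of 21 complex entries (entry (i,j), j<=i, at offset i*(i+1)/2+j, matching the indexing visible above). LLH_perform_fwd_bwd_subs_PRECISION solves L*L^H*x = b by a forward and a backward substitution, and LLH_multiply_PRECISION applies y = L*(L^H*x). A minimal standalone sketch of that per-block algebra with plain complex doubles (an illustration only, not the library routines, which additionally walk the buffer over the start..end range):

#include <complex.h>

/* Solve L*(L^H)*x = b for one 6x6 clover block; L is a packed lower
 * triangle of 21 complex entries, entry (i,j) at offset i*(i+1)/2 + j. */
static void llh_solve_block(double complex x[6], const double complex b[6],
                            const double complex L[21]) {
  int i, j;
  for (i = 0; i < 6; i++) {                  /* forward substitution with L    */
    x[i] = b[i];
    for (j = 0; j < i; j++)
      x[i] -= L[i*(i+1)/2 + j] * x[j];
    x[i] /= L[i*(i+1)/2 + i];
  }
  for (i = 5; i >= 0; i--) {                 /* backward substitution with L^H */
    for (j = i+1; j < 6; j++)
      x[i] -= conj(L[j*(j+1)/2 + i]) * x[j];
    x[i] /= conj(L[i*(i+1)/2 + i]);
  }
}

/* Apply the clover block, y = L * (L^H * x), same packed storage. */
static void llh_apply_block(double complex y[6], const double complex x[6],
                            const double complex L[21]) {
  double complex z[6];
  int i, j;
  for (i = 0; i < 6; i++) {                  /* z = L^H x */
    z[i] = 0;
    for (j = i; j < 6; j++)
      z[i] += conj(L[j*(j+1)/2 + i]) * x[j];
  }
  for (i = 0; i < 6; i++) {                  /* y = L z   */
    y[i] = 0;
    for (j = 0; j <= i; j++)
      y[i] += L[i*(i+1)/2 + j] * z[j];
  }
}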
-* - vector_PRECISION x: Input vector. -* - vector_PRECISION y: Output vector. +* - vector_PRECISION *x: Input vector. +* - vector_PRECISION *y: Output vector. * - config_PRECISION L: Cholesky factor ( lower triangular matrix ) *********************************************************************************/ - register int i, j; + register int id, i, j; int n; complex_PRECISION z[6]; + buffer_PRECISION x_pt = x->vector_buffer, y_pt = y->vector_buffer; + x_pt += start; y_pt += start; - for ( n=0; n<2; n++ ) { - // z = L^H x - for ( j=0; j<6; j++ ) { // columns - for ( i=0; ivector_buffer, y_pt = y->vector_buffer; + x_pt += start; y_pt += start; #ifdef HAVE_TM1p1 - if( g.n_flavours == 2) - for ( n=0; n<2; n++ ) { - for ( i=0; i<12; i++ ) { - y[i] = LU[i*(12+1)]*x[i]; - for ( j=i+1; j<12; j++ ) - y[i] += LU[i*12+j]*x[j]; + if( g.n_flavours == 2) { + LU += (start/24)*288; + for ( id=start; id0; i-- ) + for ( j=0; j0; i-- ) - for ( j=0; j0; i-- ) + for ( j=0; j0; i-- ) - for ( j=0; jclover_doublet_vectorized + (start/24)*288; - PRECISION *x_pt = (PRECISION*)x; - PRECISION *y_pt = (PRECISION*)y; - for ( int i=start; iepsbar_term+(start/24)*12; - if ( g.n_flavours == 2 && - ( op->epsbar != 0 || op->epsbar_ig5_odd_shift != 0 || op->epsbar_ig5_odd_shift != 0 ) ) - apply_doublet_coupling_PRECISION( x, y, epsbar_term, end-start ); -#else - config_PRECISION sc = op->clover_doublet_oo_inv + (start/24)*288; - // diagonal blocks applied to the even sites - for ( int i=start; iclover_doublet_oo_inv, start, end); } else { #endif - x += start; y += start; if ( g.csw ) { -#ifdef OPTIMIZED_SELF_COUPLING_PRECISION - PRECISION *sc_pt = op->clover_vectorized + (start/12)*144; - PRECISION *x_pt = (PRECISION*)x; - PRECISION *y_pt = (PRECISION*)y; - for ( int i=start; iclover + (start/12)*72; - // diagonal blocks applied to the even sites - for ( int i=start; iclover, start, end); #else - config_PRECISION sc = op->clover + (start/12)*42; - // diagonal blocks applied to the even sites - for ( int i=start; iclover, start, end ); #endif } else { config_PRECISION sc = op->clover + start; - for ( int i=start; ivector_buffer[i] = x->vector_buffer[i]*sc[i]; i++;) } } #ifdef HAVE_TM1p1 @@ -443,38 +446,29 @@ void diag_ee_PRECISION( vector_PRECISION y, vector_PRECISION x, operator_PRECISI } // for debugging only -void diag_ee_inv_PRECISION( vector_PRECISION y, vector_PRECISION x, operator_PRECISION_struct *op, +void diag_ee_inv_PRECISION( vector_PRECISION *y, vector_PRECISION *x, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { START_UNTHREADED_FUNCTION(threading) #ifdef HAVE_TM1p1 if( g.n_flavours == 2) { - int i, n1 = op->num_even_sites; - config_PRECISION sc = op->clover_doublet_oo_inv; - // diagonal blocks applied to the even sites - for ( i=0; inum_even_sites; + LU_perform_fwd_bwd_subs_PRECISION( y, x, op->clover_doublet_oo_inv, 0, n1*24); } else { #endif int i, n1 = op->num_even_sites; - config_PRECISION sc = op->clover; if ( g.csw ) { // diagonal blocks applied to the even sites - for ( i=0; iclover, 0, n1*12 ); #else - LU_perform_fwd_bwd_subs_PRECISION( y, x, sc ); - y+=12; x+=12; sc+=72; + LU_perform_fwd_bwd_subs_PRECISION( y, x, op->clover, 0, n1*12 ); #endif - } } else { - for ( i=0; iclover; + for ( i=0; ivector_buffer[i] = x->vector_buffer[i]/sc[i]; i++;) } } #ifdef HAVE_TM1p1 @@ -484,51 +478,35 @@ void diag_ee_inv_PRECISION( vector_PRECISION y, vector_PRECISION x, operator_PRE } // for debugging only -void diag_oo_PRECISION( vector_PRECISION y, vector_PRECISION x, 
operator_PRECISION_struct *op, +void diag_oo_PRECISION( vector_PRECISION *y, vector_PRECISION *x, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { /********************************************************************************* * Applies the odd-odd block of the odd even decomposition to a vector. -* - vector_PRECISION x: Input vector. -* - vector_PRECISION y: Output vector. +* - vector_PRECISION *x: Input vector. +* - vector_PRECISION *y: Output vector. *********************************************************************************/ START_UNTHREADED_FUNCTION(threading) #ifdef HAVE_TM1p1 if( g.n_flavours == 2) { - int i, n1 = op->num_even_sites, n2 = op->num_odd_sites; - config_PRECISION sc = op->clover_doublet_oo_inv + n1*288; - x += n1*24; y += n1*24; - // diagonal blocks applied to the even sites - for ( i=0; inum_even_sites, n2 = op->num_odd_sites; + LU_multiply_PRECISION( y, x, op->clover_doublet_oo_inv, n1*24, (n1+n2)*24 ); } else { #endif int i, n1 = op->num_even_sites, n2 = op->num_odd_sites; - config_PRECISION sc = op->clover; - x += n1*12; y += n1*12; // diagonal blocks applied to the odd sites if ( g.csw ) { #ifndef HAVE_TM - sc += n1*42; - for ( i=0; iclover, n1*12, (n1+n2)*12 ); #else - sc += n1*72; - for ( i=0; iclover, n1*12, (n1+n2)*12 ); #endif } else { - sc += n1*12; - for ( i=0; iclover + n1*12; + for ( i=n1*12; i<(n1+n2)*12; ) { + FOR12( y->vector_buffer[i] = x->vector_buffer[i]*sc[i]; i++;) } } #ifdef HAVE_TM1p1 @@ -539,59 +517,26 @@ void diag_oo_PRECISION( vector_PRECISION y, vector_PRECISION x, operator_PRECISI } -void diag_oo_inv_PRECISION( vector_PRECISION y, vector_PRECISION x, operator_PRECISION_struct *op, +void diag_oo_inv_PRECISION( vector_PRECISION *y, vector_PRECISION *x, operator_PRECISION_struct *op, level_struct *l, int start, int end ) { #ifdef HAVE_TM1p1 if( g.n_flavours == 2) { - x += start; y += start; // inverse diagonal blocks applied to the odd sites -#ifdef OPTIMIZED_SELF_COUPLING_PRECISION - PRECISION *sc_pt = op->clover_doublet_oo_inv_vectorized + (start/24)*2*288; - PRECISION *x_pt = (PRECISION*)x; - PRECISION *y_pt = (PRECISION*)y; - for ( int i=start; iclover_doublet_oo_inv + (start/24)*288; - for ( int i=start; iclover_doublet_oo_inv, start, end ); } else { #endif - config_PRECISION sc = op->clover; - x += start; y += start; // inverse diagonal blocks applied to the odd sites if ( g.csw ) { -#ifdef OPTIMIZED_SELF_COUPLING_PRECISION - PRECISION *sc_pt = op->clover_vectorized + 2*2*(3*start); - PRECISION *x_pt = (PRECISION*)x; - PRECISION *y_pt = (PRECISION*)y; - for ( int i=start; iclover, start, end ); #else - sc += (start/12)*42; - for ( int i=start; iclover, start, end ); #endif } else { - sc += start; - for ( int i=start; iclover + start; + for ( int i=start; ivector_buffer[i] = x->vector_buffer[i]/sc[i]; i++;) } } #ifdef HAVE_TM1p1 @@ -655,31 +600,12 @@ void oddeven_setup_PRECISION( operator_double_struct *in, level_struct *l ) { MALLOC( op->clover, complex_PRECISION, lu_dec_size*n ); Aee = op->clover; Aoo = op->clover + op->num_even_sites*lu_dec_size; - /* TODO: fix the vectorized part -#ifdef OPTIMIZED_SELF_COUPLING_PRECISION - MALLOC_HUGEPAGES( op->clover_vectorized, PRECISION, l->num_inner_lattice_sites*2*2*36, 4*SIMD_LENGTH_PRECISION ); - PRECISION *Aee_vectorized = op->clover_vectorized; - PRECISION *Aoo_vectorized = op->clover_vectorized + op->num_even_sites*2*2*36; -#endif - */ for ( t=0; tclover_doublet_oo_inv, complex_PRECISION, lu_doublet_dec_size*n ); Aee = op->clover_doublet_oo_inv; Aoo = 
op->clover_doublet_oo_inv + op->num_even_sites*lu_doublet_dec_size; - /* -#ifdef OPTIMIZED_SELF_COUPLING_PRECISION - MALLOC_HUGEPAGES( op->clover_doublet_vectorized, PRECISION, l->num_inner_lattice_sites*2*4*36, 4*SIMD_LENGTH_PRECISION ); - MALLOC_HUGEPAGES( op->clover_doublet_oo_inv_vectorized, PRECISION, op->num_odd_sites*2*2*144, 4*SIMD_LENGTH_PRECISION ); - PRECISION *Aee_vectorized = op->clover_doublet_vectorized; - PRECISION *Aoo_vectorized = op->clover_doublet_vectorized + op->num_even_sites*288; - PRECISION *Aoo_inverse_vectorized = op->clover_doublet_oo_inv_vectorized; -#endif - */ for ( t=0; tD_vectorized, PRECISION, 2*4*l->inner_vector_size, 4*SIMD_LENGTH_PRECISION ); - MALLOC_HUGEPAGES( op->D_transformed_vectorized, PRECISION, 2*4*l->inner_vector_size, 4*SIMD_LENGTH_PRECISION ); - for ( int i=0; inum_inner_lattice_sites; i++ ) { - PRECISION *D_vectorized = op->D_vectorized + 96*i; - PRECISION *D_transformed_vectorized = op->D_transformed_vectorized + 96*i; - complex_PRECISION *D_out_pt = op->D + 36*i; - for ( int mu=0; mu<4; mu++ ) { - set_PRECISION_D_vectorized( D_vectorized+24*mu, D_transformed_vectorized+24*mu, D_out_pt+9*mu ); - } - } -#endif - // define data layout MALLOC( op->index_table, int, N[T]*N[Z]*N[Y]*N[X] ); eot = op->index_table; @@ -922,15 +792,15 @@ void oddeven_setup_PRECISION( operator_double_struct *in, level_struct *l ) { MALLOC( op->prnT, complex_PRECISION, j*8 ); op->prnZ = op->prnT + j; op->prnY = op->prnZ + j; op->prnX = op->prnY + j; op->prpT = op->prnX + j; op->prpZ = op->prpT + j; op->prpY = op->prpZ + j; op->prpX = op->prpY + j; - MALLOC( op->buffer, complex_PRECISION*, 2 ); - op->buffer[0] = NULL; + MALLOC( op->buffer, vector_PRECISION, 2 ); + for(int i=0; i<2; i++ ){ + vector_PRECISION_init( &(op->buffer[i]) ); #ifdef HAVE_TM1p1 - MALLOC( op->buffer[0], complex_PRECISION, 4*l->vector_size ); - op->buffer[1] = op->buffer[0] + 2*l->vector_size; + vector_PRECISION_alloc( &(op->buffer[i]), _ORDINARY, 2, l, no_threading ); #else - MALLOC( op->buffer[0], complex_PRECISION, 2*l->vector_size ); - op->buffer[1] = op->buffer[0] + l->vector_size; + vector_PRECISION_alloc( &(op->buffer[i]), _ORDINARY, 1, l, no_threading ); #endif + } ghost_alloc_PRECISION( 0, &(op->c), l ); ghost_sendrecv_init_PRECISION( _COARSE_GLOBAL, &(op->c), l ) ; l->sp_PRECISION.v_end = op->num_even_sites*l->num_lattice_site_var; @@ -945,18 +815,6 @@ void oddeven_free_PRECISION( level_struct *l ) { lu_dec_size = 72; #endif -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_float - FREE_HUGEPAGES( l->oe_op_PRECISION.D_vectorized, PRECISION, 2*4*l->inner_vector_size ); - FREE_HUGEPAGES( l->oe_op_PRECISION.D_transformed_vectorized, PRECISION, 2*4*l->inner_vector_size ); -#endif -#ifdef OPTIMIZED_SELF_COUPLING_PRECISION - FREE_HUGEPAGES( l->oe_op_PRECISION.clover_vectorized, PRECISION, l->num_inner_lattice_sites*2*2*36 ); -#ifdef HAVE_TM1p1 - FREE_HUGEPAGES( l->oe_op_PRECISION.clover_doublet_vectorized, PRECISION, l->num_inner_lattice_sites*2*4*36 ); - FREE_HUGEPAGES( l->oe_op_PRECISION.clover_doublet_oo_inv_vectorized, PRECISION, l->num_inner_lattice_sites*2*2*144 ); -#endif -#endif - ghost_free_PRECISION( &(l->oe_op_PRECISION.c), l ); FREE( l->oe_op_PRECISION.D, complex_PRECISION, 4*nc_size*n ); if ( g.csw ) @@ -977,13 +835,15 @@ void oddeven_free_PRECISION( level_struct *l ) { FREE( l->oe_op_PRECISION.c.boundary_table[2*mu], int, bs ); l->oe_op_PRECISION.c.boundary_table[2*mu+1] = NULL; } - + + for(int i=0; i<2; i++ ){ #ifdef HAVE_TM1p1 - FREE( l->oe_op_PRECISION.buffer[0], complex_PRECISION, 
4*l->vector_size ); + vector_PRECISION_free( &(l->oe_op_PRECISION.buffer[i]), l, no_threading ); #else - FREE( l->oe_op_PRECISION.buffer[0], complex_PRECISION, 2*l->vector_size ); + vector_PRECISION_free( &(l->oe_op_PRECISION.buffer[i]), l, no_threading ); #endif - FREE( l->oe_op_PRECISION.buffer, complex_PRECISION*, 2 ); + } + FREE( l->oe_op_PRECISION.buffer, vector_PRECISION, 2 ); #ifdef HAVE_TM1p1 FREE( l->oe_op_PRECISION.prnT, complex_PRECISION, 2*(l->num_lattice_site_var/2)*l->num_lattice_sites*8 ); FREE( l->oe_op_PRECISION.clover_doublet_oo_inv, complex_PRECISION, 288*n ); @@ -993,7 +853,7 @@ void oddeven_free_PRECISION( level_struct *l ) { } -void oddeven_to_serial_PRECISION( vector_double out, vector_PRECISION in, level_struct *l, struct Thread *threading ) { +void oddeven_to_serial_PRECISION( vector_double *out, vector_PRECISION *in, level_struct *l, struct Thread *threading ) { /********************************************************************************* * Translates a vector from an odd even PRECISION precision layout to a serial @@ -1011,7 +871,7 @@ void oddeven_to_serial_PRECISION( vector_double out, vector_PRECISION in, level_ for ( i=start; ivector_buffer[i*nsv+j] = (complex_double) in->vector_buffer[k*nsv+j]; } } END_NO_HYPERTHREADS(threading) @@ -1019,7 +879,7 @@ void oddeven_to_serial_PRECISION( vector_double out, vector_PRECISION in, level_ } -void serial_to_oddeven_PRECISION( vector_PRECISION out, vector_double in, level_struct *l, struct Thread *threading ) { +void serial_to_oddeven_PRECISION( vector_PRECISION *out, vector_double *in, level_struct *l, struct Thread *threading ) { /********************************************************************************* * Translates a vector from a serial double precision layout to an odd even @@ -1037,7 +897,7 @@ void serial_to_oddeven_PRECISION( vector_PRECISION out, vector_double in, level_ for ( i=start; ivector_buffer[k*nsv+j] = (complex_PRECISION) in->vector_buffer[i*nsv+j]; } } END_NO_HYPERTHREADS(threading) @@ -1045,7 +905,7 @@ void serial_to_oddeven_PRECISION( vector_PRECISION out, vector_double in, level_ } -void oddeven_to_block_PRECISION( vector_PRECISION out, vector_PRECISION in, level_struct *l, struct Thread *threading ) { +void oddeven_to_block_PRECISION( vector_PRECISION *out, vector_PRECISION *in, level_struct *l, struct Thread *threading ) { int i, j, k, m, nsv = l->num_lattice_site_var, *tt_oe = l->oe_op_PRECISION.translation_table, @@ -1059,7 +919,7 @@ void oddeven_to_block_PRECISION( vector_PRECISION out, vector_PRECISION in, leve for ( i=start; ivector_buffer[m*nsv+j] = in->vector_buffer[k*nsv+j]; } } END_NO_HYPERTHREADS(threading) @@ -1067,7 +927,7 @@ void oddeven_to_block_PRECISION( vector_PRECISION out, vector_PRECISION in, leve } -void block_to_oddeven_PRECISION( vector_PRECISION out, vector_PRECISION in, level_struct *l, struct Thread *threading ) { +void block_to_oddeven_PRECISION( vector_PRECISION *out, vector_PRECISION *in, level_struct *l, struct Thread *threading ) { int i, j, k, m, nsv = l->num_lattice_site_var, *tt_oe = l->oe_op_PRECISION.translation_table, @@ -1081,14 +941,14 @@ void block_to_oddeven_PRECISION( vector_PRECISION out, vector_PRECISION in, leve for ( i=start; ivector_buffer[k*nsv+j] = in->vector_buffer[m*nsv+j]; } } END_NO_HYPERTHREADS(threading) SYNC_CORES(threading) } -void hopping_term_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operator_PRECISION_struct *op, +void hopping_term_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, operator_PRECISION_struct 
*op, const int amount, level_struct *l, struct Thread *threading ) { int start_even, end_even, start_odd, end_odd, n = l->num_inner_lattice_sites, @@ -1115,29 +975,20 @@ void hopping_term_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operato plus_dir_param = _ODD_SITES; } -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - complex_PRECISION *prn[4] = { op->prnT, op->prnZ, op->prnY, op->prnX }; - complex_PRECISION *prp[4] = { op->prpT, op->prpZ, op->prpY, op->prpX }; -#else int i, *nb_pt; - vector_PRECISION phi_pt, eta_pt, end_pt; + buffer_PRECISION phi_pt, eta_pt, end_pt; config_PRECISION D_pt; -#endif #ifdef HAVE_TM1p1 if( g.n_flavours == 2 ) { // project in negative directions -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - dprp_PRECISION( prn, phi, 24*start, 24*n ); -#else complex_PRECISION pbuf[12]; - for ( i=12*start, phi_pt=phi+24*start; i<12*n; i+=12, phi_pt+=24 ) { + for ( i=12*start, phi_pt=phi->vector_buffer+24*start; i<12*n; i+=12, phi_pt+=24 ) { dprp_T_PRECISION( op->prnT+i, phi_pt ); dprp_Z_PRECISION( op->prnZ+i, phi_pt ); dprp_Y_PRECISION( op->prnY+i, phi_pt ); dprp_X_PRECISION( op->prnX+i, phi_pt ); } -#endif // start communication in negative direction START_LOCKED_MASTER(threading) ghost_sendrecv_PRECISION( op->prnT, T, -1, &(op->c), minus_dir_param, l ); @@ -1146,10 +997,7 @@ void hopping_term_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operato ghost_sendrecv_PRECISION( op->prnX, X, -1, &(op->c), minus_dir_param, l ); END_LOCKED_MASTER(threading) // project plus dir and multiply with U dagger -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - dprn_su3_PRECISION( prp, phi, op, neighbor, 24*start, 24*n ); -#else - for ( phi_pt=phi+24*start, end_pt=phi+24*n, D_pt = op->D+36*start, nb_pt=neighbor+4*start; phi_ptvector_buffer+24*start, end_pt=phi->vector_buffer+24*n, D_pt = op->D+36*start, nb_pt=neighbor+4*start; phi_ptprpX+i+6, D_pt, pbuf+6 ); mvmh_PRECISION( op->prpX+i+9, D_pt, pbuf+9 ); D_pt += 9; } -#endif if ( amount == _EVEN_SITES ) { start = start_even, n = end_even; } else if ( amount == _ODD_SITES ) { @@ -1198,10 +1045,7 @@ void hopping_term_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operato ghost_wait_PRECISION( op->prnX, X, -1, &(op->c), minus_dir_param, l ); END_LOCKED_MASTER(threading) // multiply with U and lift up minus dir -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - su3_dpbp_PRECISION( eta, prn, op, neighbor, 24*start, 24*n ); -#else - for ( eta_pt=eta+24*start, end_pt=eta+24*n, D_pt = op->D+36*start, nb_pt=neighbor+4*start; eta_ptvector_buffer+24*start, end_pt=eta->vector_buffer+24*n, D_pt = op->D+36*start, nb_pt=neighbor+4*start; eta_ptprnT+i ); @@ -1231,7 +1075,6 @@ void hopping_term_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operato mvm_PRECISION( pbuf+9, D_pt, op->prnX+i+9 ); dpbp_su3_X_PRECISION( pbuf, eta_pt ); D_pt += 9; } -#endif // wait for communication in positive direction START_LOCKED_MASTER(threading) ghost_wait_PRECISION( op->prpT, T, +1, &(op->c), plus_dir_param, l ); @@ -1240,30 +1083,22 @@ void hopping_term_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operato ghost_wait_PRECISION( op->prpX, X, +1, &(op->c), plus_dir_param, l ); END_LOCKED_MASTER(threading) // lift up plus dir -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - dpbn_PRECISION( eta, prp, 24*start, 24*n ); -#else - for ( i=12*start, eta_pt=eta+24*start; i<12*n; i+=12, eta_pt+=24 ) { + for ( i=12*start, eta_pt=eta->vector_buffer+24*start; i<12*n; i+=12, eta_pt+=24 ) { dpbn_su3_T_PRECISION( op->prpT+i, eta_pt ); dpbn_su3_Z_PRECISION( 
op->prpZ+i, eta_pt ); dpbn_su3_Y_PRECISION( op->prpY+i, eta_pt ); dpbn_su3_X_PRECISION( op->prpX+i, eta_pt ); } -#endif } else { #endif // project in negative directions -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - prp_PRECISION( prn, phi, 12*start, 12*n ); -#else complex_PRECISION pbuf[6]; - for ( i=6*start, phi_pt=phi+12*start; i<6*n; i+=6, phi_pt+=12 ) { + for ( i=6*start, phi_pt=phi->vector_buffer+12*start; i<6*n; i+=6, phi_pt+=12 ) { prp_T_PRECISION( op->prnT+i, phi_pt ); prp_Z_PRECISION( op->prnZ+i, phi_pt ); prp_Y_PRECISION( op->prnY+i, phi_pt ); prp_X_PRECISION( op->prnX+i, phi_pt ); } -#endif // start communication in negative direction START_LOCKED_MASTER(threading) ghost_sendrecv_PRECISION( op->prnT, T, -1, &(op->c), minus_dir_param, l ); @@ -1272,10 +1107,7 @@ void hopping_term_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operato ghost_sendrecv_PRECISION( op->prnX, X, -1, &(op->c), minus_dir_param, l ); END_LOCKED_MASTER(threading) // project plus dir and multiply with U dagger -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - prn_su3_PRECISION( prp, phi, op, neighbor, 12*start, 12*n ); -#else - for ( phi_pt=phi+12*start, end_pt=phi+12*n, D_pt = op->D+36*start, nb_pt=neighbor+4*start; phi_ptvector_buffer+12*start, end_pt=phi->vector_buffer+12*n, D_pt = op->D+36*start, nb_pt=neighbor+4*start; phi_ptprpX+i, D_pt, pbuf ); mvmh_PRECISION( op->prpX+i+3, D_pt, pbuf+3 ); D_pt += 9; } -#endif if ( amount == _EVEN_SITES ) { start = start_even, n = end_even; } else if ( amount == _ODD_SITES ) { @@ -1316,10 +1147,7 @@ void hopping_term_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operato ghost_wait_PRECISION( op->prnX, X, -1, &(op->c), minus_dir_param, l ); END_LOCKED_MASTER(threading) // multiply with U and lift up minus dir -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - su3_pbp_PRECISION( eta, prn, op, neighbor, 12*start, 12*n ); -#else - for ( eta_pt=eta+12*start, end_pt=eta+12*n, D_pt = op->D+36*start, nb_pt=neighbor+4*start; eta_ptvector_buffer+12*start, end_pt=eta->vector_buffer+12*n, D_pt = op->D+36*start, nb_pt=neighbor+4*start; eta_ptprnT+i ); @@ -1341,7 +1169,6 @@ void hopping_term_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operato mvm_PRECISION( pbuf+3, D_pt, op->prnX+i+3 ); pbp_su3_X_PRECISION( pbuf, eta_pt ); D_pt += 9; } -#endif // wait for communication in positive direction START_LOCKED_MASTER(threading) ghost_wait_PRECISION( op->prpT, T, +1, &(op->c), plus_dir_param, l ); @@ -1350,16 +1177,12 @@ void hopping_term_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operato ghost_wait_PRECISION( op->prpX, X, +1, &(op->c), plus_dir_param, l ); END_LOCKED_MASTER(threading) // lift up plus dir -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - pbn_PRECISION( eta, prp, 12*start, 12*n ); -#else - for ( i=6*start, eta_pt=eta+12*start; i<6*n; i+=6, eta_pt+=12 ) { + for ( i=6*start, eta_pt=eta->vector_buffer+12*start; i<6*n; i+=6, eta_pt+=12 ) { pbn_su3_T_PRECISION( op->prpT+i, eta_pt ); pbn_su3_Z_PRECISION( op->prpZ+i, eta_pt ); pbn_su3_Y_PRECISION( op->prpY+i, eta_pt ); pbn_su3_X_PRECISION( op->prpX+i, eta_pt ); } -#endif #ifdef HAVE_TM1p1 } #endif @@ -1367,7 +1190,7 @@ void hopping_term_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operato SYNC_CORES(threading) } -void apply_schur_complement_PRECISION( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, +void apply_schur_complement_PRECISION( vector_PRECISION *out, vector_PRECISION *in, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { 
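What this routine computes on the even sites is the even-odd Schur complement S = D_ee - D_eo * D_oo^{-1} * D_oe, assembled below from diag_ee_PRECISION, hopping_term_PRECISION and diag_oo_inv_PRECISION. A scalar toy example of that algebra, self-contained and independent of the library types:

#include <complex.h>
#include <stdio.h>

/* Scalar illustration of the even-odd Schur complement
 *   S = D_ee - D_eo * D_oo^{-1} * D_oe
 * that apply_schur_complement_PRECISION realises block-wise. */
int main(void) {
  double complex Dee = 4.0, Deo = 1.0 + I, Doe = 1.0 - I, Doo = 3.0;
  double complex S = Dee - Deo * Doe / Doo;
  printf("S = %g %+gi\n", creal(S), cimag(S));
  return 0;
}

Solving the even-site system with S and back-substituting for the odd sites, as solve_oddeven_PRECISION does below, is equivalent to solving the full system while iterating on only half the unknowns.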
/********************************************************************************* @@ -1383,8 +1206,8 @@ void apply_schur_complement_PRECISION( vector_PRECISION out, vector_PRECISION in vector_PRECISION *tmp = op->buffer; SYNC_CORES(threading) - vector_PRECISION_define( tmp[0], 0, start_odd, end_odd, l ); - vector_PRECISION_define( tmp[0], 0, start_even, end_even, l ); + vector_PRECISION_define( &tmp[0], 0, start_odd, end_odd, l ); + vector_PRECISION_define( &tmp[0], 0, start_even, end_even, l ); SYNC_CORES(threading) PROF_PRECISION_START( _NC, threading ); @@ -1392,17 +1215,17 @@ void apply_schur_complement_PRECISION( vector_PRECISION out, vector_PRECISION in diag_ee_PRECISION( out, in, op, l, start_even, end_even ); SYNC_CORES(threading) PROF_PRECISION_STOP( _SC, 1, threading ); - hopping_term_PRECISION( tmp[0], in, op, _ODD_SITES, l, threading ); + hopping_term_PRECISION( &tmp[0], in, op, _ODD_SITES, l, threading ); PROF_PRECISION_STOP( _NC, 0, threading ); PROF_PRECISION_START( _SC, threading ); - diag_oo_inv_PRECISION( tmp[1], tmp[0], op, l, start_odd, end_odd ); + diag_oo_inv_PRECISION( &tmp[1], &tmp[0], op, l, start_odd, end_odd ); SYNC_CORES(threading) PROF_PRECISION_STOP( _SC, 0, threading ); PROF_PRECISION_START( _NC, threading ); - hopping_term_PRECISION( tmp[0], tmp[1], op, _EVEN_SITES, l, threading ); + hopping_term_PRECISION( &tmp[0], &tmp[1], op, _EVEN_SITES, l, threading ); PROF_PRECISION_STOP( _NC, 1, threading ); - vector_PRECISION_minus( out, out, tmp[0], start_even, end_even, l ); + vector_PRECISION_minus( out, out, &tmp[0], start_even, end_even, l ); } @@ -1417,80 +1240,68 @@ void solve_oddeven_PRECISION( gmres_PRECISION_struct *p, operator_PRECISION_stru // odd to even PROF_PRECISION_START( _SC, threading ); - diag_oo_inv_PRECISION( tmp, p->b, op, l, start, end ); + diag_oo_inv_PRECISION( &tmp, &(p->b), op, l, start, end ); PROF_PRECISION_STOP( _SC, 0, threading ); SYNC_CORES(threading) - vector_PRECISION_scale( tmp, tmp, -1, start, end, l ); + vector_PRECISION_scale( &tmp, &tmp, -1, start, end, l ); SYNC_CORES(threading) PROF_PRECISION_START( _NC, threading ); - hopping_term_PRECISION( p->b, tmp, op, _EVEN_SITES, l, threading ); + hopping_term_PRECISION( &(p->b), &tmp, op, _EVEN_SITES, l, threading ); PROF_PRECISION_STOP( _NC, 0, threading ); if ( g.method == 4 ) fgmres_PRECISION( p, l, threading ); else if ( g.method == 5 ) bicgstab_PRECISION( p, l, threading ); - diag_oo_inv_PRECISION( p->x, p->b, op, l, start, end ); + diag_oo_inv_PRECISION( &(p->x), &(p->b), op, l, start, end ); // even to odd SYNC_CORES(threading) - vector_PRECISION_define( tmp, 0, start, end, l ); + vector_PRECISION_define( &tmp, 0, start, end, l ); SYNC_CORES(threading) PROF_PRECISION_START( _NC, threading ); - hopping_term_PRECISION( tmp, p->x, op, _ODD_SITES, l, threading ); + hopping_term_PRECISION( &tmp, &(p->x), op, _ODD_SITES, l, threading ); PROF_PRECISION_STOP( _NC, 1, threading ); PROF_PRECISION_START( _SC, threading ); - diag_oo_inv_PRECISION( p->b, tmp, op, l, start, end ); + diag_oo_inv_PRECISION( &(p->b), &tmp, op, l, start, end ); PROF_PRECISION_STOP( _SC, 1, threading ); SYNC_CORES(threading) - vector_PRECISION_minus( p->x, p->x, p->b, start, end, l ); + vector_PRECISION_minus( &(p->x), &(p->x), &(p->b), start, end, l ); SYNC_CORES(threading) } -void g5_PRECISION( vector_PRECISION eta, vector_PRECISION phi, int start, int end, level_struct *l ) { - if ( eta != phi ) { - vector_PRECISION eta_end = eta + end; - eta += start; - phi += start; - while ( eta < eta_end ) { - FOR6( *eta 
= -(*phi); phi++; eta++; ) - FOR6( *eta = (*phi); phi++; eta++; ) +void g5_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, int start, int end, level_struct *l ) { + if ( eta->vector_buffer != phi->vector_buffer ) { + for( int i = start; i < end; ) { + FOR6( eta->vector_buffer[i] = -phi->vector_buffer[i]; i++; ) + FOR6( eta->vector_buffer[i] = phi->vector_buffer[i]; i++; ) } } else { - vector_PRECISION eta_end = eta + end; - eta += start; - phi += start; - while ( eta < eta_end ) { - FOR6( *eta = -(*phi); phi++; eta++; ) - eta+=6; phi+=6; + for ( int i = start; i < end; ) { + FOR6( eta->vector_buffer[i] = phi->vector_buffer[i]; i++; ) + i+=6; } } } -void minus_g5_PRECISION( vector_PRECISION eta, vector_PRECISION phi, int start, int end, level_struct *l ) { - if ( eta != phi ) { - vector_PRECISION eta_end = eta + end; - eta += start; - phi += start; - while ( eta < eta_end ) { - FOR6( *eta = (*phi); phi++; eta++; ) - FOR6( *eta = -(*phi); phi++; eta++; ) +void minus_g5_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, int start, int end, level_struct *l ) { + if ( eta->vector_buffer != phi->vector_buffer ) { + for ( int i = start; i < end; ) { + FOR6( eta->vector_buffer[i] = phi->vector_buffer[i]; i++; ) + FOR6( eta->vector_buffer[i] = -phi->vector_buffer[i]; i++; ) } } else { - vector_PRECISION eta_end = eta + end; - eta += start; - phi += start; - while ( eta < eta_end ) { - eta+=6; phi+=6; - FOR6( *eta = -(*phi); phi++; eta++; ) + for ( int i = start; i < end; ) { + i+=6; + FOR6( eta->vector_buffer[i] = -phi->vector_buffer[i]; i++; ) } } } -void g5D_apply_schur_complement_PRECISION( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { +void g5D_apply_schur_complement_PRECISION( vector_PRECISION *out, vector_PRECISION *in, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { /********************************************************************************* * Applies the Schur complement to a vector. 
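The rewritten g5_PRECISION and minus_g5_PRECISION above act per lattice site on twelve spin-colour components: in this basis gamma5 flips the sign of the first six components and leaves the last six unchanged, and minus_g5 does the opposite. A standalone per-site sketch of that action with plain complex doubles (illustration only; the library versions index through vector_buffer over a start..end range):

#include <complex.h>

/* Per-site gamma5 action: negate the first six of the twelve spin-colour
 * components, copy the remaining six unchanged. */
static void g5_site(double complex eta[12], const double complex phi[12]) {
  int c;
  for (c = 0; c < 6; c++)  eta[c] = -phi[c];
  for (c = 6; c < 12; c++) eta[c] =  phi[c];
}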
@@ -1502,10 +1313,14 @@ void g5D_apply_schur_complement_PRECISION( vector_PRECISION out, vector_PRECISIO compute_core_start_end_custom(op->num_even_sites*l->num_lattice_site_var, l->inner_vector_size, &start_odd, &end_odd, l, threading, l->num_lattice_site_var ); vector_PRECISION *tmp = op->buffer; + + // vector_PRECISION **tmp; + // *tmp->vector_buffer = op->buffer->vector_buffer; + SYNC_CORES(threading) - vector_PRECISION_define( tmp[0], 0, start_odd, end_odd, l ); - vector_PRECISION_define( tmp[0], 0, start_even, end_even, l ); + vector_PRECISION_define( &tmp[0], 0, start_odd, end_odd, l ); + vector_PRECISION_define( &tmp[0], 0, start_even, end_even, l ); SYNC_CORES(threading) PROF_PRECISION_START( _NC, threading ); @@ -1513,17 +1328,17 @@ void g5D_apply_schur_complement_PRECISION( vector_PRECISION out, vector_PRECISIO diag_ee_PRECISION( out, in, op, l, start_even, end_even ); SYNC_CORES(threading) PROF_PRECISION_STOP( _SC, 1, threading ); - hopping_term_PRECISION( tmp[0], in, op, _ODD_SITES, l, threading ); + hopping_term_PRECISION( &tmp[0], in, op, _ODD_SITES, l, threading ); PROF_PRECISION_STOP( _NC, 0, threading ); PROF_PRECISION_START( _SC, threading ); - diag_oo_inv_PRECISION( tmp[1], tmp[0], op, l, start_odd, end_odd ); + diag_oo_inv_PRECISION( &tmp[1], &tmp[0], op, l, start_odd, end_odd ); SYNC_CORES(threading) PROF_PRECISION_STOP( _SC, 0, threading ); PROF_PRECISION_START( _NC, threading ); - hopping_term_PRECISION( tmp[0], tmp[1], op, _EVEN_SITES, l, threading ); + hopping_term_PRECISION( &tmp[0], &tmp[1], op, _EVEN_SITES, l, threading ); PROF_PRECISION_STOP( _NC, 1, threading ); - vector_PRECISION_minus( out, out, tmp[0], start_even, end_even, l ); + vector_PRECISION_minus( out, out, &tmp[0], start_even, end_even, l ); SYNC_CORES(threading) g5_PRECISION( out, out, start_even, end_even, l ); // g5_PRECISION( out, out, start_odd, end_odd, l ); @@ -1541,40 +1356,40 @@ void g5D_solve_oddeven_PRECISION( gmres_PRECISION_struct *p, operator_PRECISION_ // odd to even PROF_PRECISION_START( _SC, threading ); - diag_oo_inv_PRECISION( tmp, p->b, op, l, start_odd, end_odd ); + diag_oo_inv_PRECISION( &tmp, &(p->b), op, l, start_odd, end_odd ); PROF_PRECISION_STOP( _SC, 0, threading ); SYNC_CORES(threading) // g5_PRECISION( tmp, tmp, start_odd, end_odd, l ); // vector_PRECISION_scale( tmp, tmp, -1, start_odd, end_odd, l ); - minus_g5_PRECISION( tmp, tmp, start_odd, end_odd, l ); + minus_g5_PRECISION( &tmp, &tmp, start_odd, end_odd, l ); SYNC_CORES(threading) PROF_PRECISION_START( _NC, threading ); - vector_PRECISION_define( p->x, 0, start_even, end_even, l ); - hopping_term_PRECISION( p->x, tmp, op, _EVEN_SITES, l, threading ); + vector_PRECISION_define( &(p->x), 0, start_even, end_even, l ); + hopping_term_PRECISION( &(p->x), &tmp, op, _EVEN_SITES, l, threading ); PROF_PRECISION_STOP( _NC, 0, threading ); SYNC_CORES(threading) - g5_PRECISION( p->x, p->x, start_even, end_even, l ); - vector_PRECISION_plus( p->b, p->b, p->x, start_even, end_even, l ); + g5_PRECISION( &(p->x), &(p->x), start_even, end_even, l ); + vector_PRECISION_plus( &(p->b), &(p->b), &(p->x), start_even, end_even, l ); SYNC_CORES(threading) ASSERT( g.method == 6 ); fgmres_PRECISION( p, l, threading ); - diag_oo_inv_PRECISION( p->x, p->b, op, l, start_odd, end_odd ); - g5_PRECISION( p->x, p->x, start_odd, end_odd, l ); + diag_oo_inv_PRECISION( &(p->x), &(p->b), op, l, start_odd, end_odd ); + g5_PRECISION( &(p->x), &(p->x), start_odd, end_odd, l ); // even to odd SYNC_CORES(threading) - vector_PRECISION_define( tmp, 0, 
start_odd, end_odd, l ); + vector_PRECISION_define( &tmp, 0, start_odd, end_odd, l ); SYNC_CORES(threading) PROF_PRECISION_START( _NC, threading ); - hopping_term_PRECISION( tmp, p->x, op, _ODD_SITES, l, threading ); + hopping_term_PRECISION( &tmp, &(p->x), op, _ODD_SITES, l, threading ); PROF_PRECISION_STOP( _NC, 1, threading ); SYNC_CORES(threading) PROF_PRECISION_START( _SC, threading ); - diag_oo_inv_PRECISION( p->b, tmp, op, l, start_odd, end_odd ); + diag_oo_inv_PRECISION( &(p->b), &tmp, op, l, start_odd, end_odd ); PROF_PRECISION_STOP( _SC, 1, threading ); SYNC_CORES(threading) - vector_PRECISION_minus( p->x, p->x, p->b, start_odd, end_odd, l ); + vector_PRECISION_minus( &(p->x), &(p->x), &(p->b), start_odd, end_odd, l ); SYNC_CORES(threading) } @@ -1596,14 +1411,9 @@ void schwarz_PRECISION_oddeven_setup( schwarz_PRECISION_struct *s, level_struct } if ( g.csw ) { -#ifndef OPTIMIZED_SELF_COUPLING_PRECISION config_PRECISION clover_pt = op->clover, clover_oo_inv_pt = op->clover_oo_inv; complex_double buffer[42]; int cs = 42; -#else - PRECISION *clover_pt = op->clover_vectorized, *clover_oo_inv_pt = op->clover_oo_inv_vectorized; - int cs = 144; -#endif for ( d0=0; d0clover_doublet_oo_inv, clover_pt = op->clover; int cs = g.csw ? 42:12; -#else - PRECISION *clover_pt = g.csw ? op->clover_doublet_vectorized:(PRECISION*)op->clover, *clover_oo_inv_pt = op->clover_doublet_oo_inv_vectorized; - int cs = g.csw ? 288:24; -#endif config_PRECISION eps_term_pt = op->epsbar_term; #ifdef HAVE_TM tm_term_pt = op->tm_term; @@ -1688,8 +1486,6 @@ void schwarz_PRECISION_oddeven_setup( schwarz_PRECISION_struct *s, level_struct for ( x=a1*block_size[X]; x<(a1+1)*block_size[X]; x++ ) { if (((t-d1*block_size[T])+(z-c1*block_size[Z])+ (y-b1*block_size[Y])+(x-a1*block_size[X]))%2 == 1 ) { - -#ifndef OPTIMIZED_SELF_COUPLING_PRECISION if ( g.csw ) { for( i=0; i<12; i++ ) //0-23 buffer[i+12] = buffer[i] = (complex_double) clover_pt[i]; @@ -1715,48 +1511,13 @@ void schwarz_PRECISION_oddeven_setup( schwarz_PRECISION_struct *s, level_struct clover_pt += cs; selfcoupling_LU_doublet_decomposition_PRECISION( clover_oo_inv_pt, buffer ); clover_oo_inv_pt += 288; -#else - if ( g.csw ) { - sse_site_clover_doublet_invert_PRECISION( clover_pt, eps_term_pt, clover_oo_inv_pt ); - } else { -#ifdef HAVE_TM - for ( i=0; i<6; i++ ) { //we temporaly save in clover_oo_inv_pt - clover_oo_inv_pt[2*i] = clover_pt[2*i] + creal_PRECISION(tm_term_pt[i]); - clover_oo_inv_pt[2*i+1] = clover_pt[2*i+1] + cimag_PRECISION(tm_term_pt[i]); - clover_oo_inv_pt[2*i+12] = clover_pt[2*i] - creal_PRECISION(tm_term_pt[i]); - clover_oo_inv_pt[2*i+13] = clover_pt[2*i+1] - cimag_PRECISION(tm_term_pt[i]); - } - for ( i=6; i<12; i++ ) { - clover_oo_inv_pt[2*i+12] = clover_pt[2*i] + creal_PRECISION(tm_term_pt[i]); - clover_oo_inv_pt[2*i+13] = clover_pt[2*i+1] + cimag_PRECISION(tm_term_pt[i]); - clover_oo_inv_pt[2*i+24] = clover_pt[2*i] - creal_PRECISION(tm_term_pt[i]); - clover_oo_inv_pt[2*i+25] = clover_pt[2*i+1] - cimag_PRECISION(tm_term_pt[i]); - } - tm_term_pt += 12; -#else - for ( i=0; i<6; i++ ) { - clover_oo_inv_pt[2*i+12] = clover_oo_inv_pt[2*i] = clover_pt[2*i]; - clover_oo_inv_pt[2*i+13] = clover_oo_inv_pt[2*i+1] = clover_pt[2*i+1]; - } - for ( i=6; i<12; i++ ) { - clover_oo_inv_pt[2*i+24] = clover_oo_inv_pt[2*i+12] = clover_pt[2*i]; - clover_oo_inv_pt[2*i+25] = clover_oo_inv_pt[2*i+13] = clover_pt[2*i+1]; - } -#endif - sse_site_clover_doublet_invert_PRECISION( clover_oo_inv_pt, eps_term_pt, clover_oo_inv_pt ); - } - - clover_pt += cs; - eps_term_pt 
+= 12; - clover_oo_inv_pt += 2*288; -#endif } } } #endif } -void block_diag_ee_PRECISION( vector_PRECISION eta, vector_PRECISION phi, +void block_diag_ee_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, int start, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ) { START_UNTHREADED_FUNCTION(threading) @@ -1767,69 +1528,51 @@ void block_diag_ee_PRECISION( vector_PRECISION eta, vector_PRECISION phi, } // diagonal blocks applied to the odd sites of a block -void block_diag_oo_PRECISION( vector_PRECISION eta, vector_PRECISION phi, +void block_diag_oo_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, int start, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ) { START_UNTHREADED_FUNCTION(threading) - -#ifdef OPTIMIZED_SELF_COUPLING_PRECISION - //we don't have the LU decomposition here, for debugging only - int n1 = s->num_block_even_sites, n2 = s->num_block_odd_sites, nv = l->num_lattice_site_var; - clover_PRECISION( eta, phi, &(s->op), start+nv*n1, start+nv*(n1+n2), l, threading ); - -#else - int i, n1 = s->num_block_even_sites, n2 = s->num_block_odd_sites; #ifdef HAVE_TM1p1 if ( g.n_flavours == 2 ) { int block_num = start/24/(n1+n2); // config_PRECISION clover = s->op.clover_doublet_oo_inv+n1*288+(start/24)*288; - config_PRECISION clover = s->op.clover_doublet_oo_inv+(start/24-block_num*n1)*288; - vector_PRECISION lphi = phi+n1*24+start, leta = eta+n1*24+start; - for ( i=0; iop.clover_doublet_oo_inv-(block_num+1)*n1*288; + LU_multiply_PRECISION( eta, phi, clover, n1*24+start, (n1+n2)*24+start ); } else { #endif - vector_PRECISION lphi = phi+n1*12+start, leta = eta+n1*12+start; if ( g.csw ) { int block_num = start/12/(n1+n2); #ifndef HAVE_TM - config_PRECISION clover = s->op.clover_oo_inv+(start/12-block_num*n1)*42; - for ( i=0; iop.clover_oo_inv-(block_num+1)*n1*42; + LLH_multiply_PRECISION( eta, phi, clover, n1*12+start, (n1+n2)*12+start ); #else - config_PRECISION clover = s->op.clover_oo_inv+(start/12-block_num*n1)*72; - for ( i=0; iop.clover_oo_inv-(block_num+1)*n1*72; + LU_multiply_PRECISION( eta, phi, clover, n1*12+start, (n1+n2)*12+start ); #endif } else { + vector_PRECISION lphi, leta; + lphi.vector_buffer = phi->vector_buffer+n1*12+start; + leta.vector_buffer = eta->vector_buffer+n1*12+start; config_PRECISION clover = s->op.clover+n1*12+start; #ifndef HAVE_TM for ( i=0; i<12*n2; i++ ) - leta[i] = lphi[i]*(clover[i]); + leta.vector_buffer[i] = lphi.vector_buffer[i]*(clover[i]); #else config_PRECISION tm_term = s->op.tm_term+n1*12+start; for ( i=0; i<12*n2; i++ ) - leta[i] = lphi[i]*(clover[i]+tm_term[i]); + leta.vector_buffer[i] = lphi.vector_buffer[i]*(clover[i]+tm_term[i]); #endif } #ifdef HAVE_TM1p1 } #endif - -#endif END_UNTHREADED_FUNCTION(threading) } // inverted diagonal blocks applied to the odd sites of a block -void block_diag_oo_inv_PRECISION( vector_PRECISION eta, vector_PRECISION phi, int start, schwarz_PRECISION_struct *s, +void block_diag_oo_inv_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, int start, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ) { START_UNTHREADED_FUNCTION(threading) @@ -1839,58 +1582,33 @@ void block_diag_oo_inv_PRECISION( vector_PRECISION eta, vector_PRECISION phi, in #ifdef HAVE_TM1p1 if ( g.n_flavours == 2 ) { - vector_PRECISION lphi = phi+n1*24+start, leta = eta+n1*24+start; int block_num = start/24/(n1+n2); -#ifndef OPTIMIZED_SELF_COUPLING_PRECISION - config_PRECISION clover = s->op.clover_doublet_oo_inv + (start/24-block_num*n1)*288; - for ( i=0; 
iop.clover_doublet_oo_inv_vectorized + (start/24-block_num*n1)*2*288; - for ( i=0; iop.clover_doublet_oo_inv-(block_num+1)*n1*288; + LU_perform_fwd_bwd_subs_PRECISION( eta, phi, clover, n1*24+start, (n1+n2)*24+start ); } else { #endif - vector_PRECISION lphi = phi+n1*12+start, leta = eta+n1*12+start; if ( g.csw ) { int block_num = start/12/(n1+n2); -#ifndef OPTIMIZED_SELF_COUPLING_PRECISION #ifndef HAVE_TM - config_PRECISION clover = s->op.clover_oo_inv+(start/12-block_num*n1)*42; - for ( i=0; iop.clover_oo_inv+(start/12-block_num*n1)*72; - for ( i=0; iop.clover_oo_inv-(block_num+1)*n1*42; + LLH_perform_fwd_bwd_subs_PRECISION( eta, phi, clover, n1*12+start, (n1+n2)*12+start ); #else - PRECISION *clover_vectorized = s->op.clover_oo_inv_vectorized + (start/12-block_num*n1)*144; - for ( i=0; iop.clover_oo_inv-(block_num+1)*n1*72; + LU_perform_fwd_bwd_subs_PRECISION( eta, phi, clover, n1*12+start, (n1+n2)*12+start ); #endif } else { config_PRECISION clover = s->op.clover+n1*12+start; + vector_PRECISION lphi, leta; + lphi.vector_buffer = phi->vector_buffer+n1*12+start; + leta.vector_buffer = eta->vector_buffer+n1*12+start; #ifndef HAVE_TM for ( i=0; i<12*n2; i++ ) - leta[i] = lphi[i]/(clover[i]); + leta.vector_buffer[i] = lphi.vector_buffer[i]/(clover[i]); #else config_PRECISION tm_term = s->op.tm_term+n1*12+start; for ( i=0; i<12*n2; i++ ) - leta[i] = lphi[i]/(clover[i]+tm_term[i]); + leta.vector_buffer[i] = lphi.vector_buffer[i]/(clover[i]+tm_term[i]); #endif } #ifdef HAVE_TM1p1 @@ -1901,7 +1619,7 @@ void block_diag_oo_inv_PRECISION( vector_PRECISION eta, vector_PRECISION phi, in } -void block_hopping_term_PRECISION( vector_PRECISION eta, vector_PRECISION phi, +void block_hopping_term_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, int start, int amount, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ) { START_UNTHREADED_FUNCTION(threading) @@ -1909,30 +1627,10 @@ void block_hopping_term_PRECISION( vector_PRECISION eta, vector_PRECISION phi, int a1, a2, n1, n2, *length_even = s->dir_length_even, *length_odd = s->dir_length_odd, **index = s->oe_index, *neighbor = s->op.neighbor_table, nv = l->num_lattice_site_var; -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_float - PRECISION *Dplus = s->op.D_vectorized + (start/nv)*96; - PRECISION *Dminus = s->op.D_transformed_vectorized + (start/nv)*96; - - for ( int mu=0; mu<4; mu++ ) { - if ( amount == _EVEN_SITES ) { - a1 = 0; n1 = length_even[mu]; - a2 = n1; n2 = a2 + length_odd[mu]; - } else if ( amount == _ODD_SITES ) { - a1 = length_even[mu]; n1 = a1 + length_odd[mu]; - a2 = 0; n2 = a1; - } else { - a1 = 0; n1 = length_even[mu]+length_odd[mu]; - a2 = 0; n2 = n1; - } - block_oddeven_plus_coupling_PRECISION( (PRECISION*)(eta+start), Dplus, (PRECISION*)(phi+start), mu, a1, n1, index[mu], neighbor ); - block_oddeven_minus_coupling_PRECISION( (PRECISION*)(eta+start), Dminus, (PRECISION*)(phi+start), mu, a2, n2, index[mu], neighbor ); - } - -#else config_PRECISION D = s->op.D + (start/nv)*36; int i, j, k, *ind; config_PRECISION D_pt; - vector_PRECISION lphi = phi+start, leta = eta+start; + buffer_PRECISION lphi = phi->vector_buffer+start, leta = eta->vector_buffer+start; #ifdef HAVE_TM1p1 if ( g.n_flavours == 2 ) { @@ -2188,13 +1886,12 @@ void block_hopping_term_PRECISION( vector_PRECISION eta, vector_PRECISION phi, } #ifdef HAVE_TM1p1 } -#endif #endif END_UNTHREADED_FUNCTION(threading) } -void block_n_hopping_term_PRECISION( vector_PRECISION eta, vector_PRECISION phi, +void block_n_hopping_term_PRECISION( vector_PRECISION *eta, 
vector_PRECISION *phi, int start, int amount, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ) { START_UNTHREADED_FUNCTION(threading) @@ -2202,28 +1899,8 @@ void block_n_hopping_term_PRECISION( vector_PRECISION eta, vector_PRECISION phi, int a1, a2, n1, n2, *length_even = s->dir_length_even, *length_odd = s->dir_length_odd, **index = s->oe_index, *neighbor = s->op.neighbor_table, nv = l->num_lattice_site_var; -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_float - PRECISION *Dplus = s->op.D_vectorized + (start/nv)*96; - PRECISION *Dminus = s->op.D_transformed_vectorized + (start/nv)*96; - - for ( int mu=0; mu<4; mu++ ) { - if ( amount == _EVEN_SITES ) { - a1 = 0; n1 = length_even[mu]; - a2 = n1; n2 = a2 + length_odd[mu]; - } else if ( amount == _ODD_SITES ) { - a1 = length_even[mu]; n1 = a1 + length_odd[mu]; - a2 = 0; n2 = a1; - } else { - a1 = 0; n1 = length_even[mu]+length_odd[mu]; - a2 = 0; n2 = n1; - } - block_oddeven_nplus_coupling_PRECISION( (PRECISION*)(eta+start), Dplus, (PRECISION*)(phi+start), mu, a1, n1, index[mu], neighbor ); - block_oddeven_nminus_coupling_PRECISION( (PRECISION*)(eta+start), Dminus, (PRECISION*)(phi+start), mu, a2, n2, index[mu], neighbor ); - } - -#else int i, j, k, *ind; - vector_PRECISION lphi = phi+start, leta = eta+start; + buffer_PRECISION lphi = phi->vector_buffer+start, leta = eta->vector_buffer+start; config_PRECISION D_pt, D = s->op.D + (start/nv)*36; #ifdef HAVE_TM1p1 @@ -2482,27 +2159,26 @@ void block_n_hopping_term_PRECISION( vector_PRECISION eta, vector_PRECISION phi, #ifdef HAVE_TM1p1 } #endif -#endif END_UNTHREADED_FUNCTION(threading) } -void apply_block_schur_complement_PRECISION( vector_PRECISION out, vector_PRECISION in, int start, +void apply_block_schur_complement_PRECISION( vector_PRECISION *out, vector_PRECISION *in, int start, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ) { vector_PRECISION *tmp = s->oe_buf; block_diag_ee_PRECISION( out, in, start, s, l, threading ); START_LOCKED_MASTER(threading) - vector_PRECISION_define( tmp[0], 0, start + l->num_lattice_site_var*s->num_block_even_sites, start + s->block_vector_size, l ); + vector_PRECISION_define( &tmp[0], 0, start + l->num_lattice_site_var*s->num_block_even_sites, start + s->block_vector_size, l ); END_LOCKED_MASTER(threading) - block_hopping_term_PRECISION( tmp[0], in, start, _ODD_SITES, s, l, threading ); - block_diag_oo_inv_PRECISION( tmp[1], tmp[0], start, s, l, threading ); - block_n_hopping_term_PRECISION( out, tmp[1], start, _EVEN_SITES, s, l, threading ); + block_hopping_term_PRECISION( &tmp[0], in, start, _ODD_SITES, s, l, threading ); + block_diag_oo_inv_PRECISION( &tmp[1], &tmp[0], start, s, l, threading ); + block_n_hopping_term_PRECISION( out, &tmp[1], start, _EVEN_SITES, s, l, threading ); } -void block_solve_oddeven_PRECISION( vector_PRECISION phi, vector_PRECISION r, vector_PRECISION latest_iter, +void block_solve_oddeven_PRECISION( vector_PRECISION *phi, vector_PRECISION *r, vector_PRECISION *latest_iter, int start, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ) { START_UNTHREADED_FUNCTION(threading) @@ -2511,21 +2187,19 @@ void block_solve_oddeven_PRECISION( vector_PRECISION phi, vector_PRECISION r, ve int end = start+s->block_vector_size; // odd to even - vector_PRECISION_copy( tmp[3], r, start, end, l ); - block_diag_oo_inv_PRECISION( tmp[2], tmp[3], start, s, l, no_threading ); - block_n_hopping_term_PRECISION( tmp[3], tmp[2], start, _EVEN_SITES, s, l, no_threading ); - - local_minres_PRECISION( 
NULL, tmp[3], tmp[2], start, s, l, no_threading ); - + vector_PRECISION_copy( &tmp[3], r, start, end, l ); + block_diag_oo_inv_PRECISION( &tmp[2], &tmp[3], start, s, l, no_threading ); + block_n_hopping_term_PRECISION( &tmp[3], &tmp[2], start, _EVEN_SITES, s, l, no_threading ); + local_minres_PRECISION( NULL, &tmp[3], &tmp[2], start, s, l, no_threading ); // even to odd - block_n_hopping_term_PRECISION( tmp[3], tmp[2], start, _ODD_SITES, s, l, no_threading ); - block_diag_oo_inv_PRECISION( tmp[2], tmp[3], start, s, l, no_threading ); + block_n_hopping_term_PRECISION( &tmp[3], &tmp[2], start, _ODD_SITES, s, l, no_threading ); + block_diag_oo_inv_PRECISION( &tmp[2], &tmp[3], start, s, l, no_threading ); // update phi, latest_iter - vector_PRECISION_copy( latest_iter, tmp[2], start, end, l ); - vector_PRECISION_plus( phi, phi, tmp[2], start, end, l ); + vector_PRECISION_copy( latest_iter, &tmp[2], start, end, l ); + vector_PRECISION_plus( phi, phi, &tmp[2], start, end, l ); // update r - vector_PRECISION_copy( r, tmp[3], start, start+l->num_lattice_site_var*s->num_block_even_sites, l ); + vector_PRECISION_copy( r, &tmp[3], start, start+l->num_lattice_site_var*s->num_block_even_sites, l ); vector_PRECISION_define( r, 0, start+l->num_lattice_site_var*s->num_block_even_sites, end, l ); END_UNTHREADED_FUNCTION(threading) @@ -2537,55 +2211,61 @@ void block_oddeven_PRECISION_test( level_struct *l, struct Thread *threading ) { schwarz_PRECISION_struct *s = &(l->s_PRECISION); - vector_PRECISION b1 = NULL, b2 = NULL, b3 = NULL, b4 = NULL, b5 = NULL; + vector_PRECISION b1, b2, b3, b4, b5; PRECISION diff; + + vector_PRECISION_init(&b1); + vector_PRECISION_init(&b2); + vector_PRECISION_init(&b3); + vector_PRECISION_init(&b4); + vector_PRECISION_init(&b5); int vs = s->block_vector_size * s->num_blocks; - MALLOC( b1, complex_PRECISION, vs ); - MALLOC( b2, complex_PRECISION, vs ); - MALLOC( b3, complex_PRECISION, vs ); - MALLOC( b4, complex_PRECISION, vs ); - MALLOC( b5, complex_PRECISION, vs ); + MALLOC( b1.vector_buffer, complex_PRECISION, vs ); + MALLOC( b2.vector_buffer, complex_PRECISION, vs ); + MALLOC( b3.vector_buffer, complex_PRECISION, vs ); + MALLOC( b4.vector_buffer, complex_PRECISION, vs ); + MALLOC( b5.vector_buffer, complex_PRECISION, vs ); - vector_PRECISION_define_random( b1, 0, vs, l ); + vector_PRECISION_define_random( &b1, 0, vs, l ); for (int i = 0; i< s->num_blocks; i++ ) { - block_diag_ee_PRECISION( b2, b1, s->block[i].start*l->num_lattice_site_var, s, l, no_threading ); - block_diag_oo_PRECISION( b2, b1, s->block[i].start*l->num_lattice_site_var, s, l, no_threading ); - block_hopping_term_PRECISION( b2, b1, s->block[i].start*l->num_lattice_site_var, _FULL_SYSTEM, s, l, no_threading ); + block_diag_ee_PRECISION( &b2, &b1, s->block[i].start*l->num_lattice_site_var, s, l, no_threading ); + block_diag_oo_PRECISION( &b2, &b1, s->block[i].start*l->num_lattice_site_var, s, l, no_threading ); + block_hopping_term_PRECISION( &b2, &b1, s->block[i].start*l->num_lattice_site_var, _FULL_SYSTEM, s, l, no_threading ); - block_d_plus_clover_PRECISION( b3, b1, s->block[i].start*l->num_lattice_site_var, s, l, no_threading ); + block_d_plus_clover_PRECISION( &b3, &b1, s->block[i].start*l->num_lattice_site_var, s, l, no_threading ); } - vector_PRECISION_minus( b3, b3, b2, 0, vs, l ); - diff = global_norm_PRECISION( b3, 0, vs, l, no_threading ) / global_norm_PRECISION( b2, 0, vs, l, no_threading ); + vector_PRECISION_minus( &b3, &b3, &b2, 0, vs, l ); + diff = global_norm_PRECISION( &b3, 0, vs, l, 
no_threading ) / global_norm_PRECISION( &b2, 0, vs, l, no_threading ); test0_PRECISION("depth: %d, correctness of block odd even layout: %le\n", l->depth, diff ); - vector_PRECISION_copy( b4, b1, 0, s->block_vector_size, l ); - vector_PRECISION_define( b3, 0, l->num_lattice_site_var*s->num_block_even_sites, s->block_vector_size, l ); + vector_PRECISION_copy( &b4, &b1, 0, s->block_vector_size, l ); + vector_PRECISION_define( &b3, 0, l->num_lattice_site_var*s->num_block_even_sites, s->block_vector_size, l ); - block_hopping_term_PRECISION( b3, b4, 0, _ODD_SITES, s, l, no_threading ); - block_diag_oo_inv_PRECISION( b5, b3, 0, s, l, no_threading ); - vector_PRECISION_plus( b4, b4, b5, l->num_lattice_site_var*s->num_block_even_sites, s->block_vector_size, l ); + block_hopping_term_PRECISION( &b3, &b4, 0, _ODD_SITES, s, l, no_threading ); + block_diag_oo_inv_PRECISION( &b5, &b3, 0, s, l, no_threading ); + vector_PRECISION_plus( &b4, &b4, &b5, l->num_lattice_site_var*s->num_block_even_sites, s->block_vector_size, l ); - apply_block_schur_complement_PRECISION( b3, b4, 0, s, l, no_threading ); - block_diag_oo_PRECISION( b3, b4, 0, s, l, no_threading ); + apply_block_schur_complement_PRECISION( &b3, &b4, 0, s, l, no_threading ); + block_diag_oo_PRECISION( &b3, &b4, 0, s, l, no_threading ); - block_diag_oo_inv_PRECISION( b5, b3, 0, s, l, no_threading ); - block_hopping_term_PRECISION( b3, b5, 0, _EVEN_SITES, s, l, no_threading ); + block_diag_oo_inv_PRECISION( &b5, &b3, 0, s, l, no_threading ); + block_hopping_term_PRECISION( &b3, &b5, 0, _EVEN_SITES, s, l, no_threading ); - vector_PRECISION_minus( b3, b2, b3, 0, s->block_vector_size, l ); - diff = global_norm_PRECISION( b3, 0, s->block_vector_size, l, no_threading ) / global_norm_PRECISION( b2, 0, s->block_vector_size, l, no_threading ); + vector_PRECISION_minus( &b3, &b2, &b3, 0, s->block_vector_size, l ); + diff = global_norm_PRECISION( &b3, 0, s->block_vector_size, l, no_threading ) / global_norm_PRECISION( &b2, 0, s->block_vector_size, l, no_threading ); test0_PRECISION("depth: %d, correctness of block odd even schur complement: %le\n", l->depth, diff ); - FREE( b1, complex_PRECISION, vs ); - FREE( b2, complex_PRECISION, vs ); - FREE( b3, complex_PRECISION, vs ); - FREE( b4, complex_PRECISION, vs ); - FREE( b5, complex_PRECISION, vs ); + FREE( b1.vector_buffer, complex_PRECISION, vs ); + FREE( b2.vector_buffer, complex_PRECISION, vs ); + FREE( b3.vector_buffer, complex_PRECISION, vs ); + FREE( b4.vector_buffer, complex_PRECISION, vs ); + FREE( b5.vector_buffer, complex_PRECISION, vs ); END_UNTHREADED_FUNCTION(threading) } @@ -2600,74 +2280,72 @@ void oddeven_PRECISION_test( level_struct *l ) { * - Compare solutions ( Difference should be close to 0 ). 
*********************************************************************************/ - vector_double d1=NULL, d2=NULL, d3=NULL; - vector_PRECISION f1=NULL, f2=NULL, f3=NULL, f4=NULL, f5=NULL; + vector_double d[3]; + vector_PRECISION f[5]; double diff; - MALLOC( d1, complex_double, l->inner_vector_size ); - MALLOC( d2, complex_double, l->inner_vector_size ); - MALLOC( d3, complex_double, l->inner_vector_size ); - MALLOC( f1, complex_PRECISION, l->inner_vector_size ); - MALLOC( f2, complex_PRECISION, l->inner_vector_size ); - MALLOC( f3, complex_PRECISION, l->inner_vector_size ); - MALLOC( f4, complex_PRECISION, l->inner_vector_size ); - MALLOC( f5, complex_PRECISION, l->inner_vector_size ); - - vector_double_define_random( d1, 0, l->inner_vector_size, l ); - serial_to_oddeven_PRECISION( f1, d1, l, no_threading ); + for(int i=0; i<3; i++){ + vector_double_init( &d[i] ); + vector_double_alloc( &d[i], _INNER, 1, l, no_threading ); + } + + for(int i=0; i<5; i++){ + vector_PRECISION_init( &f[i] ); + vector_PRECISION_alloc( &f[i], _INNER, 1, l, no_threading ); + } + + vector_double_define_random( &d[0], 0, l->inner_vector_size, l ); + serial_to_oddeven_PRECISION( &f[0], &d[0], l, no_threading ); - diag_ee_PRECISION( f2, f1, &(l->oe_op_PRECISION), l, 0, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var ); - diag_oo_PRECISION( f2, f1, &(l->oe_op_PRECISION), l, no_threading ); + diag_ee_PRECISION( &f[1], &f[0], &(l->oe_op_PRECISION), l, 0, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var ); + diag_oo_PRECISION( &f[1], &f[0], &(l->oe_op_PRECISION), l, no_threading ); - hopping_term_PRECISION( f2, f1, &(l->oe_op_PRECISION), _FULL_SYSTEM, l, no_threading ); + hopping_term_PRECISION( &f[1], &f[0], &(l->oe_op_PRECISION), _FULL_SYSTEM, l, no_threading ); - d_plus_clover_double( d2, d1, &(g.op_double), l, no_threading ); - oddeven_to_serial_PRECISION( d1, f2, l, no_threading ); + d_plus_clover_double( &d[1], &d[0], &(g.op_double), l, no_threading ); + oddeven_to_serial_PRECISION( &d[0], &f[1], l, no_threading ); - vector_double_minus( d3, d1, d2, 0, l->num_inner_lattice_sites, l ); - diff = global_norm_double( d3, 0, l->num_inner_lattice_sites, l, no_threading )/global_norm_double( d1, 0, l->num_inner_lattice_sites, l, no_threading ); + vector_double_minus( &d[2], &d[0], &d[1], 0, l->num_inner_lattice_sites, l ); + diff = global_norm_double( &d[2], 0, l->num_inner_lattice_sites, l, no_threading )/global_norm_double( &d[0], 0, l->num_inner_lattice_sites, l, no_threading ); test0_PRECISION("depth: %d, correctness of odd even layout: %le\n", l->depth, diff ); // -------------- - vector_PRECISION_copy( f4, f1, 0, l->inner_vector_size, l ); - diag_oo_PRECISION( f3, f4, &(l->oe_op_PRECISION), l, no_threading ); - diag_oo_inv_PRECISION( f4, f3, &(l->oe_op_PRECISION), l, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l->inner_vector_size ); - vector_PRECISION_minus( f4, f4, f1, 0, l->inner_vector_size, l ); + vector_PRECISION_copy( &f[3], &f[0], 0, l->inner_vector_size, l ); + diag_oo_PRECISION( &f[2], &f[3], &(l->oe_op_PRECISION), l, no_threading ); + diag_oo_inv_PRECISION( &f[3], &f[2], &(l->oe_op_PRECISION), l, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l->inner_vector_size ); + vector_PRECISION_minus( &f[3], &f[3], &f[0], 0, l->inner_vector_size, l ); - diff = (PRECISION) (global_norm_PRECISION( f4, 0, l->inner_vector_size, l, no_threading )/global_norm_PRECISION( f1, 0, l->inner_vector_size, l, no_threading )); + diff = (PRECISION) (global_norm_PRECISION( &f[3], 0, 
l->inner_vector_size, l, no_threading )/global_norm_PRECISION( &f[0], 0, l->inner_vector_size, l, no_threading )); test0_PRECISION("depth: %d, correctness of odd even diagonal term: %le\n", l->depth, diff ); // transformation part - vector_PRECISION_copy( f4, f1, 0, l->inner_vector_size, l ); + vector_PRECISION_copy( &f[3], &f[0], 0, l->inner_vector_size, l ); // even to odd // set odd part of f3 to 0. - vector_PRECISION_define( f3, 0, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l->inner_vector_size, l ); + vector_PRECISION_define( &f[2], 0, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l->inner_vector_size, l ); - hopping_term_PRECISION( f3, f4, &(l->oe_op_PRECISION), _ODD_SITES, l, no_threading ); - diag_oo_inv_PRECISION( f5, f3, &(l->oe_op_PRECISION), l, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l->inner_vector_size ); - vector_PRECISION_plus( f4, f4, f5, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l->inner_vector_size, l ); + hopping_term_PRECISION( &f[2], &f[3], &(l->oe_op_PRECISION), _ODD_SITES, l, no_threading ); + diag_oo_inv_PRECISION( &f[4], &f[2], &(l->oe_op_PRECISION), l, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l->inner_vector_size ); + vector_PRECISION_plus( &f[3], &f[3], &f[4], l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l->inner_vector_size, l ); // block diagonal part - apply_schur_complement_PRECISION( f3, f4, &(l->oe_op_PRECISION), l, no_threading ); - diag_oo_PRECISION( f3, f4, &(l->oe_op_PRECISION), l, no_threading ); + apply_schur_complement_PRECISION( &f[2], &f[3], &(l->oe_op_PRECISION), l, no_threading ); + diag_oo_PRECISION( &f[2], &f[3], &(l->oe_op_PRECISION), l, no_threading ); // back transformation part - diag_oo_inv_PRECISION( f5, f3, &(l->oe_op_PRECISION), l, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l->inner_vector_size ); - hopping_term_PRECISION( f3, f5, &(l->oe_op_PRECISION), _EVEN_SITES, l, no_threading ); + diag_oo_inv_PRECISION( &f[4], &f[3], &(l->oe_op_PRECISION), l, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l->inner_vector_size ); + hopping_term_PRECISION( &f[2], &f[4], &(l->oe_op_PRECISION), _EVEN_SITES, l, no_threading ); - vector_PRECISION_minus( f1, f2, f3, 0, l->inner_vector_size, l ); - diff = (PRECISION) (global_norm_PRECISION( f1, 0, l->inner_vector_size, l, no_threading )/global_norm_PRECISION( f2, 0, l->inner_vector_size, l, no_threading )); + vector_PRECISION_minus( &f[0], &f[1], &f[2], 0, l->inner_vector_size, l ); + diff = (PRECISION) (global_norm_PRECISION( &f[0], 0, l->inner_vector_size, l, no_threading )/global_norm_PRECISION( &f[1], 0, l->inner_vector_size, l, no_threading )); test0_PRECISION("depth: %d, correctness of odd even schur complement: %le\n", l->depth, diff ); - - FREE( d1, complex_double, l->inner_vector_size ); - FREE( d2, complex_double, l->inner_vector_size ); - FREE( d3, complex_double, l->inner_vector_size ); - FREE( f1, complex_PRECISION, l->inner_vector_size ); - FREE( f2, complex_PRECISION, l->inner_vector_size ); - FREE( f3, complex_PRECISION, l->inner_vector_size ); - FREE( f4, complex_PRECISION, l->inner_vector_size ); - FREE( f5, complex_PRECISION, l->inner_vector_size ); + + for(int i=0; i<3; i++) + vector_double_free( &d[i], l, no_threading ); + + for(int i=0; i<5; i++) + vector_PRECISION_free( &f[i], l, no_threading ); } diff --git a/src/oddeven_generic.h b/src/oddeven_generic.h index 4fac101..2d03e98 100644 --- a/src/oddeven_generic.h +++ b/src/oddeven_generic.h @@ -24,39 +24,39 @@ 
struct Thread; - void hopping_term_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operator_PRECISION_struct *op, + void hopping_term_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, operator_PRECISION_struct *op, const int amount, level_struct *l, struct Thread *threading ); void oddeven_setup_PRECISION( operator_double_struct *in, level_struct *l ); void oddeven_free_PRECISION( level_struct *l ); - void oddeven_to_serial_PRECISION( vector_double out, vector_PRECISION in, level_struct *l, struct Thread *threading ); - void serial_to_oddeven_PRECISION( vector_PRECISION out, vector_double in, level_struct *l, struct Thread *threading ); + void oddeven_to_serial_PRECISION( vector_double *out, vector_PRECISION *in, level_struct *l, struct Thread *threading ); + void serial_to_oddeven_PRECISION( vector_PRECISION *out, vector_double *in, level_struct *l, struct Thread *threading ); - void oddeven_to_block_PRECISION( vector_PRECISION out, vector_PRECISION in, level_struct *l, struct Thread *threading ); - void block_to_oddeven_PRECISION( vector_PRECISION out, vector_PRECISION in, level_struct *l, struct Thread *threading ); + void oddeven_to_block_PRECISION( vector_PRECISION *out, vector_PRECISION *in, level_struct *l, struct Thread *threading ); + void block_to_oddeven_PRECISION( vector_PRECISION *out, vector_PRECISION *in, level_struct *l, struct Thread *threading ); - void block_hopping_term_PRECISION( vector_PRECISION eta, vector_PRECISION phi, + void block_hopping_term_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, int start, int amount, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ); - void block_n_hopping_term_PRECISION( vector_PRECISION eta, vector_PRECISION phi, + void block_n_hopping_term_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, int start, int amount, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ); - void block_diag_oo_inv_PRECISION( vector_PRECISION eta, vector_PRECISION phi, + void block_diag_oo_inv_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, int start, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ); - void block_diag_oo_PRECISION( vector_PRECISION eta, vector_PRECISION phi, + void block_diag_oo_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, int start, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ); - void block_diag_ee_PRECISION( vector_PRECISION eta, vector_PRECISION phi, + void block_diag_ee_PRECISION( vector_PRECISION *eta, vector_PRECISION *phi, int start, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ); - void apply_schur_complement_PRECISION( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); + void apply_schur_complement_PRECISION( vector_PRECISION *out, vector_PRECISION *in, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); void solve_oddeven_PRECISION( gmres_PRECISION_struct *p, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); - void g5D_apply_schur_complement_PRECISION( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); + void g5D_apply_schur_complement_PRECISION( vector_PRECISION *out, vector_PRECISION *in, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); void g5D_solve_oddeven_PRECISION( gmres_PRECISION_struct *p, operator_PRECISION_struct *op, level_struct *l, struct 
Thread *threading ); void schwarz_PRECISION_oddeven_setup( schwarz_PRECISION_struct *s, level_struct *l ); - void apply_block_schur_complement_PRECISION( vector_PRECISION out, vector_PRECISION in, int start, + void apply_block_schur_complement_PRECISION( vector_PRECISION *out, vector_PRECISION *in, int start, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ); - void block_solve_oddeven_PRECISION( vector_PRECISION phi, vector_PRECISION r, vector_PRECISION latest_iter, + void block_solve_oddeven_PRECISION( vector_PRECISION *phi, vector_PRECISION *r, vector_PRECISION *latest_iter, int start, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ); void oddeven_PRECISION_test( level_struct *l ); diff --git a/src/operator_generic.c b/src/operator_generic.c index f881b23..bbda504 100644 --- a/src/operator_generic.c +++ b/src/operator_generic.c @@ -29,12 +29,8 @@ void operator_PRECISION_init( operator_PRECISION_struct *op ) { op->backward_neighbor_table = NULL; op->translation_table = NULL; op->D = NULL; - op->D_vectorized = NULL; - op->D_transformed_vectorized = NULL; op->clover = NULL; op->clover_oo_inv = NULL; - op->clover_vectorized = NULL; - op->clover_oo_inv_vectorized = NULL; op->m0 = 0; #ifdef HAVE_TM op->mu = 0; @@ -49,8 +45,6 @@ void operator_PRECISION_init( operator_PRECISION_struct *op ) { op->epsbar_ig5_odd_shift = 0; op->epsbar_term = NULL; op->clover_doublet_oo_inv = NULL; - op->clover_doublet_vectorized = NULL; - op->clover_doublet_oo_inv_vectorized = NULL; #endif for ( int mu=0; mu<4; mu++ ) @@ -71,7 +65,7 @@ void operator_PRECISION_alloc_projection_buffers( operator_PRECISION_struct *op, // when used as preconditioner we usually do not need the projection buffers, unless // g.method >= 4: then oddeven_setup_float() is called in init.c, method_setup(). 
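/* Hedged aside (not part of the patch): the hunk below scales the depth-0 projection
 * buffers by g.num_rhs_vect so all right-hand sides fit in one allocation, doubled when
 * the HAVE_TM1p1 doublet is active. A standalone sketch of that arithmetic only; the
 * helper name and the example numbers are assumptions, not taken from the library. */
#include <stdio.h>

static int projection_buffer_length( int num_lattice_site_var, int num_lattice_sites,
                                     int num_rhs_vect, int have_tm1p1 ) {
  int its = ( num_lattice_site_var/2 ) * num_lattice_sites * num_rhs_vect;
  if ( have_tm1p1 )
    its *= 2;   /* doublet fields need twice the storage, as in the hunk below */
  return its;
}

int main( void ) {
  /* hypothetical values: 12 site variables, 4096 local sites, 4 right-hand sides */
  printf( "its = %d\n", projection_buffer_length( 12, 4096, 4, 0 ) );
  return 0;
}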
if ( l->depth == 0 ) { - int its = (l->num_lattice_site_var/2)*l->num_lattice_sites; + int its = (l->num_lattice_site_var/2)*l->num_lattice_sites*g.num_rhs_vect; #ifdef HAVE_TM1p1 its *= 2; #endif @@ -83,7 +77,7 @@ void operator_PRECISION_alloc_projection_buffers( operator_PRECISION_struct *op, void operator_PRECISION_free_projection_buffers( operator_PRECISION_struct *op, level_struct *l ) { if ( l->depth == 0 ) { - int its = (l->num_lattice_site_var/2)*l->num_lattice_sites; + int its = (l->num_lattice_site_var/2)*l->num_lattice_sites*g.num_rhs_vect; #ifdef HAVE_TM1p1 its *= 2; #endif @@ -144,8 +138,6 @@ void operator_PRECISION_alloc( operator_PRECISION_struct *op, const int type, le MALLOC( op->translation_table, int, l->num_inner_lattice_sites ); if ( type == _SCHWARZ && l->depth == 0 && g.odd_even ) { -#ifndef OPTIMIZED_SELF_COUPLING_PRECISION - if( g.csw ) { #ifdef HAVE_TM //we use LU here MALLOC( op->clover_oo_inv, complex_PRECISION, 72*(l->num_inner_lattice_sites/2+1) ); @@ -155,15 +147,6 @@ void operator_PRECISION_alloc( operator_PRECISION_struct *op, const int type, le } #ifdef HAVE_TM1p1 MALLOC( op->clover_doublet_oo_inv, complex_PRECISION, 12*12*2*(l->num_inner_lattice_sites/2+1) ); -#endif - -#else - if( g.csw ) - MALLOC_HUGEPAGES( op->clover_oo_inv_vectorized, PRECISION, 144*(l->num_inner_lattice_sites/2+1), 4*SIMD_LENGTH_PRECISION ); -#ifdef HAVE_TM1p1 - MALLOC_HUGEPAGES( op->clover_doublet_oo_inv_vectorized, PRECISION, 2*2*144*(l->num_inner_lattice_sites/2+1), 4*SIMD_LENGTH_PRECISION ); -#endif - #endif } @@ -224,8 +207,6 @@ void operator_PRECISION_free( operator_PRECISION_struct *op, const int type, lev FREE( op->tm_term, complex_PRECISION, block_site_size*l->num_inner_lattice_sites ); #endif if ( type == _SCHWARZ && l->depth == 0 && g.odd_even ) { -#ifndef OPTIMIZED_SELF_COUPLING_PRECISION - if( g.csw ) { #ifdef HAVE_TM //we use LU here FREE( op->clover_oo_inv, complex_PRECISION, 72*(l->num_inner_lattice_sites/2+1) ); @@ -235,15 +216,6 @@ void operator_PRECISION_free( operator_PRECISION_struct *op, const int type, lev } #ifdef HAVE_TM1p1 FREE( op->clover_doublet_oo_inv, complex_PRECISION, 12*12*2*(l->num_inner_lattice_sites/2+1) ); -#endif - -#else - if( g.csw ) - FREE_HUGEPAGES( op->clover_oo_inv_vectorized, PRECISION, 144*(l->num_inner_lattice_sites/2+1) ); -#ifdef HAVE_TM1p1 - FREE_HUGEPAGES( op->clover_doublet_oo_inv_vectorized, PRECISION, 2*2*144*(l->num_inner_lattice_sites/2+1) ); -#endif - #endif } @@ -338,45 +310,9 @@ void operator_PRECISION_set_couplings( operator_PRECISION_struct *op, level_stru } void operator_PRECISION_set_neighbor_couplings( operator_PRECISION_struct *op, level_struct *l ) { - -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - int i, n = 2*l->num_lattice_sites - l->num_inner_lattice_sites; - - for ( i=0; i<n; i++ ) { - PRECISION *D_vectorized = op->D_vectorized + 96*i; - PRECISION *D_transformed_vectorized = op->D_transformed_vectorized + 96*i; - complex_PRECISION *D_pt = op->D + 36*i; - for ( int mu=0; mu<4; mu++ ) - set_PRECISION_D_vectorized( D_vectorized+24*mu, D_transformed_vectorized+24*mu, D_pt+9*mu ); - } -#endif - } void operator_PRECISION_set_self_couplings( operator_PRECISION_struct *op, level_struct *l ) { - -#ifdef OPTIMIZED_SELF_COUPLING_PRECISION - int i, n = l->num_inner_lattice_sites; - - if ( g.csw != 0 ) - for ( i=0; i<n; i++ ) { - PRECISION *clover_vectorized_pt = op->clover_vectorized + 144*i; - config_PRECISION clover_pt = op->clover + 42*i; - sse_set_clover_PRECISION( clover_vectorized_pt, clover_pt ); -#ifdef HAVE_TM1p1 - PRECISION *clover_doublet_vectorized_pt = op->clover_doublet_vectorized + 288*i; -
sse_set_clover_doublet_PRECISION( clover_doublet_vectorized_pt, clover_pt ); -#endif -#ifdef HAVE_TM - config_PRECISION tm_term_pt = op->tm_term + 12*i; - sse_add_diagonal_clover_PRECISION( clover_vectorized_pt, tm_term_pt ); -#ifdef HAVE_TM1p1 - sse_add_diagonal_clover_doublet_PRECISION( clover_doublet_vectorized_pt, tm_term_pt ); -#endif -#endif - } -#endif - } void operator_PRECISION_test_routine( operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { @@ -390,44 +326,54 @@ void operator_PRECISION_test_routine( operator_PRECISION_struct *op, level_struc * If enabled, also tests odd even preconditioning. *********************************************************************************/ - int ivs = l->inner_vector_size; - double diff; + int ivs = l->inner_vector_size, n_vect=g.num_rhs_vect; + double diff, diff1[n_vect], diff2[n_vect]; - vector_double vd1=NULL, vd2, vd3, vd4; - vector_PRECISION vp1=NULL, vp2; - - PUBLIC_MALLOC( vd1, complex_double, 4*ivs ); - PUBLIC_MALLOC( vp1, complex_PRECISION, 2*ivs ); + vector_double vd[4]; + vector_PRECISION vp[2]; - vd2 = vd1 + ivs; vd3 = vd2 + ivs; vd4 = vd3 + ivs; vp2 = vp1 + ivs; + for(int i=0; i<4; i++){ + vector_double_init( &vd[i] ); + vector_double_alloc( &vd[i], _INNER, n_vect, l, threading ); + } + + for(int i=0; i<2; i++){ + vector_PRECISION_init( &vp[i] ); + vector_PRECISION_alloc( &vp[i], _INNER, n_vect, l, threading ); + } START_LOCKED_MASTER(threading) - vector_double_define_random( vd1, 0, l->inner_vector_size, l ); - apply_operator_double( vd2, vd1, &(g.p), l, no_threading ); + //vector_double_define_random( &vd[0], 0, l->inner_vector_size, l ); + vector_double_define_random_new( &vd[0], l, no_threading ); + apply_operator_double( &vd[1], &vd[0], &(g.p), l, no_threading ); + + trans_PRECISION_new( &vp[0], &vd[0], op->translation_table, l, no_threading ); + apply_operator_PRECISION( &vp[1], &vp[0], &(l->p_PRECISION), l, no_threading ); + trans_back_PRECISION_new( &vd[2], &vp[1], op->translation_table, l, no_threading ); - trans_PRECISION( vp1, vd1, op->translation_table, l, no_threading ); - apply_operator_PRECISION( vp2, vp1, &(l->p_PRECISION), l, no_threading ); - trans_back_PRECISION( vd3, vp2, op->translation_table, l, no_threading ); + //vector_double_minus( &vd[3], &vd[2], &vd[1], 0, l->inner_vector_size, l ); + vector_double_minus_new( &vd[3], &vd[2], &vd[1], l, no_threading ); + //diff = global_norm_double( &vd[3], 0, ivs, l, no_threading )/ + // global_norm_double( &vd[2], 0, ivs, l, no_threading ); + global_norm_double_new( diff1, &vd[3], l, no_threading ); + global_norm_double_new( diff2, &vd[2], l, no_threading ); - vector_double_minus( vd4, vd3, vd2, 0, l->inner_vector_size, l ); - diff = global_norm_double( vd4, 0, ivs, l, no_threading )/ - global_norm_double( vd3, 0, ivs, l, no_threading ); - - test0_PRECISION("depth: %d, correctness of schwarz PRECISION Dirac operator: %le\n", l->depth, diff ); + for(int i=0; i<n_vect; i++) + test0_PRECISION("depth: %d, correctness of schwarz PRECISION Dirac operator: %le\n", l->depth, diff1[i]/diff2[i] ); END_LOCKED_MASTER(threading) if(threading->n_core > 1) { - apply_operator_PRECISION( vp2, vp1, &(l->p_PRECISION), l, threading ); + apply_operator_PRECISION( &vp[1], &vp[0], &(l->p_PRECISION), l, threading ); SYNC_MASTER_TO_ALL(threading) SYNC_CORES(threading) START_LOCKED_MASTER(threading) - trans_back_PRECISION( vd3, vp2, op->translation_table, l, no_threading ); - vector_double_minus( vd4, vd3, vd2, 0, l->inner_vector_size, l ); - diff = global_norm_double( vd4, 0, ivs, l, no_threading ) / - global_norm_double( vd3, 0, ivs, l, no_threading ); +
trans_back_PRECISION( &vd[2], &vp[1], op->translation_table, l, no_threading ); + vector_double_minus( &vd[3], &vd[2], &vd[1], 0, l->inner_vector_size, l ); + diff = global_norm_double( &vd[3], 0, ivs, l, no_threading ) / + global_norm_double( &vd[2], 0, ivs, l, no_threading ); if ( diff > EPS_PRECISION ) printf0("\x1b[31m"); @@ -438,9 +384,14 @@ void operator_PRECISION_test_routine( operator_PRECISION_struct *op, level_struc END_LOCKED_MASTER(threading) } - - PUBLIC_FREE( vd1, complex_double, 4*ivs ); - PUBLIC_FREE( vp1, complex_PRECISION, 2*ivs ); + + for(int i=0; i<4; i++){ + vector_double_free( &vd[i], l, threading ); + } + + for(int i=0; i<2; i++){ + vector_PRECISION_free( &vp[i], l, threading ); + } START_LOCKED_MASTER(threading) if ( g.method >=4 && g.odd_even ) diff --git a/src/preconditioner.c b/src/preconditioner.c index d7065d4..bd2a401 100644 --- a/src/preconditioner.c +++ b/src/preconditioner.c @@ -22,19 +22,19 @@ #include "main.h" #include "preconditioner.h" -void preconditioner( vector_double phi, vector_double Dphi, vector_double eta, +void preconditioner( vector_double *phi, vector_double *Dphi, vector_double *eta, const int res, level_struct *l, struct Thread *threading ) { if ( g.method == 0 ) vector_double_copy( phi, eta, threading->start_index[l->depth], threading->end_index[l->depth], l ); else if ( g.method < 5 || g.method == 6 || !g.odd_even ) { if ( g.mixed_precision ) { - trans_float( l->sbuf_float[0], eta, l->s_float.op.translation_table, l, threading ); - vcycle_float( l->sbuf_float[1], NULL, l->sbuf_float[0], res, l, threading ); - trans_back_float( phi, l->sbuf_float[1], l->s_float.op.translation_table, l, threading ); + trans_float( &(l->sbuf_float[0]), eta, l->s_float.op.translation_table, l, threading ); + vcycle_float( &(l->sbuf_float[1]), NULL, &(l->sbuf_float[0]), res, l, threading ); + trans_back_float( phi, &(l->sbuf_float[1]), l->s_float.op.translation_table, l, threading ); } else { - trans_double( l->sbuf_double[0], eta, l->s_double.op.translation_table, l, threading ); - vcycle_double( l->sbuf_double[1], NULL, l->sbuf_double[0], res, l, threading ); - trans_back_double( phi, l->sbuf_double[1], l->s_double.op.translation_table, l, threading ); + trans_double( &(l->sbuf_double[0]), eta, l->s_double.op.translation_table, l, threading ); + vcycle_double( &(l->sbuf_double[1]), NULL, &(l->sbuf_double[0]), res, l, threading ); + trans_back_double( phi, &(l->sbuf_double[1]), l->s_double.op.translation_table, l, threading ); } } else { if ( g.mixed_precision ) { @@ -42,25 +42,25 @@ void preconditioner( vector_double phi, vector_double Dphi, vector_double eta, l->sp_float.num_restart = l->n_cy; l->sp_float.initial_guess_zero = res; END_LOCKED_MASTER(threading) - serial_to_oddeven_float( l->sp_float.b, eta, l, threading ); + serial_to_oddeven_float( &(l->sp_float.b), eta, l, threading ); if ( g.method == 6 ) { g5D_solve_oddeven_float( &(l->sp_float), &(l->oe_op_float), l, threading ); } else { solve_oddeven_float( &(l->sp_float), &(l->oe_op_float), l, threading ); } - oddeven_to_serial_float( phi, l->sp_float.x, l, threading ); + oddeven_to_serial_float( phi, &(l->sp_float.x), l, threading ); } else { START_LOCKED_MASTER(threading) l->sp_double.num_restart = l->n_cy; l->sp_double.initial_guess_zero = res; END_LOCKED_MASTER(threading) - serial_to_oddeven_double( l->sp_double.b, eta, l, threading ); + serial_to_oddeven_double( &(l->sp_double.b), eta, l, threading ); if ( g.method == 6 ) { g5D_solve_oddeven_double( &(l->sp_double), &(l->oe_op_double), l, 
threading ); } else { solve_oddeven_double( &(l->sp_double), &(l->oe_op_double), l, threading ); } - oddeven_to_serial_double( phi, l->sp_double.x, l, threading ); + oddeven_to_serial_double( phi, &(l->sp_double.x), l, threading ); } } diff --git a/src/preconditioner.h b/src/preconditioner.h index 783c70c..d3f0b02 100644 --- a/src/preconditioner.h +++ b/src/preconditioner.h @@ -29,6 +29,6 @@ #include "schwarz_float.h" #include "schwarz_double.h" - void preconditioner( vector_double phi, vector_double Dphi, vector_double eta, + void preconditioner( vector_double *phi, vector_double *Dphi, vector_double *eta, const int res, level_struct *l, struct Thread *threading ); #endif diff --git a/src/schwarz_generic.c b/src/schwarz_generic.c index 01becd3..ee5dc4b 100644 --- a/src/schwarz_generic.c +++ b/src/schwarz_generic.c @@ -32,7 +32,8 @@ void smoother_PRECISION_def( level_struct *l ) { if ( g.method == 6 ) { l->p_PRECISION.eval_operator = (l->depth > 0)?g5D_apply_coarse_operator_PRECISION:g5D_plus_clover_PRECISION; } else { - l->p_PRECISION.eval_operator = (l->depth > 0)?apply_coarse_operator_PRECISION:d_plus_clover_PRECISION; + //l->p_PRECISION.eval_operator = (l->depth > 0)?apply_coarse_operator_PRECISION:d_plus_clover_PRECISION; + l->p_PRECISION.eval_operator = d_plus_clover_PRECISION_new; } } @@ -51,16 +52,16 @@ void schwarz_PRECISION_init( schwarz_PRECISION_struct *s, level_struct *l ) { s->index[T] = NULL; s->oe_index[T] = NULL; s->block = NULL; - s->buf1 = NULL; - s->buf2 = NULL; - s->buf3 = NULL; - s->buf4 = NULL; - s->buf5 = NULL; - l->sbuf_PRECISION[0] = NULL; - s->oe_buf[0] = NULL; - s->oe_buf[1] = NULL; - s->oe_buf[2] = NULL; - s->oe_buf[3] = NULL; + vector_PRECISION_init(&(s->buf1)); + vector_PRECISION_init(&(s->buf2)); + vector_PRECISION_init(&(s->buf3)); + vector_PRECISION_init(&(s->buf4)); + vector_PRECISION_init(&(s->buf5)); + vector_PRECISION_init(&(l->sbuf_PRECISION[0])); + vector_PRECISION_init(&(s->oe_buf[0])); + vector_PRECISION_init(&(s->oe_buf[1])); + vector_PRECISION_init(&(s->oe_buf[2])); + vector_PRECISION_init(&(s->oe_buf[3])); s->local_minres_buffer[0] = NULL; s->local_minres_buffer[1] = NULL; s->local_minres_buffer[2] = NULL; @@ -69,25 +70,24 @@ void schwarz_PRECISION_init( schwarz_PRECISION_struct *s, level_struct *l ) { s->num_colors = 0; } - void schwarz_PRECISION_alloc( schwarz_PRECISION_struct *s, level_struct *l ) { int i, j, n, mu, nu, *bl = l->block_lattice; if ( g.method == 4 ) { - fgmres_PRECISION_struct_alloc( l->block_iter, 1, (l->depth==0)?l->inner_vector_size:l->vector_size, + fgmres_PRECISION_struct_alloc( l->block_iter, 1, (l->depth==0)?_INNER:_ORDINARY, EPS_PRECISION, _COARSE_GMRES, _NOTHING, NULL, (l->depth==0)?(g.odd_even?apply_schur_complement_PRECISION:d_plus_clover_PRECISION): (g.odd_even?coarse_apply_schur_complement_PRECISION:apply_coarse_operator_PRECISION), &(l->sp_PRECISION), l ); } else if ( g.method == 5 ) { - fgmres_PRECISION_struct_alloc( 5, 1, (l->depth==0)?l->inner_vector_size:l->vector_size, + fgmres_PRECISION_struct_alloc( 5, 1, (l->depth==0)?_INNER:_ORDINARY, EPS_PRECISION, _COARSE_GMRES, _NOTHING, NULL, (l->depth==0)?(g.odd_even?apply_schur_complement_PRECISION:d_plus_clover_PRECISION): (g.odd_even?coarse_apply_schur_complement_PRECISION:apply_coarse_operator_PRECISION), &(l->sp_PRECISION), l ); } else if ( g.method == 6 ) { - fgmres_PRECISION_struct_alloc( l->block_iter, 1, (l->depth==0)?l->inner_vector_size:l->vector_size, + fgmres_PRECISION_struct_alloc( l->block_iter, 1, (l->depth==0)?_INNER:_ORDINARY, EPS_PRECISION, 
_COARSE_GMRES, _NOTHING, NULL, (l->depth==0)?(g.odd_even?g5D_apply_schur_complement_PRECISION:g5D_plus_clover_PRECISION): (g.odd_even?g5D_coarse_apply_schur_complement_PRECISION:g5D_apply_coarse_operator_PRECISION), @@ -140,18 +140,19 @@ void schwarz_PRECISION_alloc( schwarz_PRECISION_struct *s, level_struct *l ) { MALLOC( s->block, block_struct, s->num_blocks ); - int svs = l->schwarz_vector_size, vs = (l->depth==0)?l->inner_vector_size:l->vector_size; + int svs = l->schwarz_vector_size; + int nvec = 1; #ifdef HAVE_TM1p1 svs *= 2; - vs *= 2; + nvec = 2; #endif if ( l->depth == 0 ) { - MALLOC( s->oe_buf[0], complex_PRECISION, 4*vs ); - s->oe_buf[1] = s->oe_buf[0] + vs; - s->oe_buf[2] = s->oe_buf[1] + vs; - s->oe_buf[3] = s->oe_buf[2] + vs; + for ( i=0; i<4; i++ ) { + vector_PRECISION_init( &(s->oe_buf[i]) ); + vector_PRECISION_alloc( &(s->oe_buf[i]), _INNER, nvec, l, no_threading ); + } } n = 0; @@ -172,37 +173,31 @@ void schwarz_PRECISION_alloc( schwarz_PRECISION_struct *s, level_struct *l ) { s->block[i].bt = NULL; MALLOC( s->block[i].bt, int, n ); } - - MALLOC( s->buf1, complex_PRECISION, vs+3*svs ); - s->buf2 = s->buf1 + vs; - s->buf3 = s->buf2 + svs; - s->buf4 = s->buf3 + svs; + vector_PRECISION_init( &(s->buf1) ); + vector_PRECISION_init( &(s->buf2) ); + vector_PRECISION_init( &(s->buf3) ); + vector_PRECISION_init( &(s->buf4) ); + + vector_PRECISION_alloc( &(s->buf1), (l->depth==0)?_INNER:_ORDINARY, nvec, l, no_threading ); + vector_PRECISION_alloc( &(s->buf2), _SCHWARZ, nvec, l, no_threading ); + vector_PRECISION_alloc( &(s->buf3), _SCHWARZ, nvec, l, no_threading ); + vector_PRECISION_alloc( &(s->buf4), _SCHWARZ, nvec, l, no_threading ); - if ( g.method == 1 ) - MALLOC( s->buf5, complex_PRECISION, svs ); - - MALLOC( l->sbuf_PRECISION[0], complex_PRECISION, 2*vs ); - l->sbuf_PRECISION[1] = l->sbuf_PRECISION[0] + vs; + if ( g.method == 1 ){ + vector_PRECISION_init( &(s->buf5) ); + vector_PRECISION_alloc( &(s->buf5), _SCHWARZ, nvec, l, no_threading ); + } + + for ( i=0; i<2; i++ ) { + vector_PRECISION_init( &(l->sbuf_PRECISION[i]) ); + vector_PRECISION_alloc( &(l->sbuf_PRECISION[i]), (l->depth==0)?_INNER:_ORDINARY, nvec, l, no_threading ); + } // these buffers are introduced to make local_minres_PRECISION thread-safe MALLOC( s->local_minres_buffer[0], complex_PRECISION, svs ); MALLOC( s->local_minres_buffer[1], complex_PRECISION, svs ); MALLOC( s->local_minres_buffer[2], complex_PRECISION, svs ); -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_float - if ( l->depth == 0 ) { - MALLOC_HUGEPAGES( s->op.D_vectorized, PRECISION, 2*4*(2*l->vector_size-l->inner_vector_size), 4*SIMD_LENGTH_PRECISION ); - MALLOC_HUGEPAGES( s->op.D_transformed_vectorized, PRECISION, 2*4*(2*l->vector_size-l->inner_vector_size), 4*SIMD_LENGTH_PRECISION ); - } -#endif -#ifdef OPTIMIZED_SELF_COUPLING_PRECISION - if ( l->depth == 0 ) { - MALLOC_HUGEPAGES( s->op.clover_vectorized, PRECISION, 2*6*l->inner_vector_size, 4*SIMD_LENGTH_PRECISION ); -#ifdef HAVE_TM1p1 - MALLOC_HUGEPAGES( s->op.clover_doublet_vectorized, PRECISION, 4*2*6*l->inner_vector_size, 4*SIMD_LENGTH_PRECISION ); -#endif - } -#endif } @@ -252,32 +247,27 @@ void schwarz_PRECISION_free( schwarz_PRECISION_struct *s, level_struct *l ) { FREE( s->block, block_struct, s->num_blocks ); - int svs = l->schwarz_vector_size, vs = (l->depth==0)?l->inner_vector_size:l->vector_size; + int svs = l->schwarz_vector_size; #ifdef HAVE_TM1p1 svs *= 2; - vs *= 2; #endif - if ( l->depth == 0 ) { - s->oe_buf[1] = NULL; - s->oe_buf[2] = NULL; - s->oe_buf[3] = NULL; - FREE( s->oe_buf[0], 
complex_PRECISION, 4*vs ); - s->oe_buf[0] = NULL; - } + if ( l->depth == 0 ) + for ( i=0; i<4; i++ ) + vector_PRECISION_free( &(s->oe_buf[i]), l, no_threading ); - - FREE( s->buf1, complex_PRECISION, vs+3*svs ); - s->buf2 = NULL; s->buf3 = NULL; - s->buf4 = NULL; + vector_PRECISION_free( &(s->buf1), l, no_threading ); + vector_PRECISION_free( &(s->buf2), l, no_threading ); + vector_PRECISION_free( &(s->buf3), l, no_threading ); + vector_PRECISION_free( &(s->buf4), l, no_threading ); if ( g.method == 1 ) - FREE( s->buf5, complex_PRECISION, svs ); + vector_PRECISION_free( &(s->buf5), l, no_threading ); operator_PRECISION_free( &(s->op), _SCHWARZ, l ); - FREE( l->sbuf_PRECISION[0], complex_PRECISION, 2*vs ); - l->sbuf_PRECISION[1] = NULL; + for ( i=0; i<2; i++ ) + vector_PRECISION_free( &(l->sbuf_PRECISION[i]), l, no_threading ); FREE( s->local_minres_buffer[0], complex_PRECISION, svs ); FREE( s->local_minres_buffer[1], complex_PRECISION, svs ); @@ -286,20 +276,6 @@ void schwarz_PRECISION_free( schwarz_PRECISION_struct *s, level_struct *l ) { s->local_minres_buffer[1] = NULL; s->local_minres_buffer[2] = NULL; -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_float - if ( l->depth == 0 ) { - FREE_HUGEPAGES( s->op.D_vectorized, PRECISION, 2*4*(2*l->vector_size-l->inner_vector_size) ); - FREE_HUGEPAGES( s->op.D_transformed_vectorized, PRECISION, 2*4*(2*l->vector_size-l->inner_vector_size) ); - } -#endif -#ifdef OPTIMIZED_SELF_COUPLING_PRECISION - if ( l->depth == 0 ) { - FREE_HUGEPAGES( s->op.clover_vectorized, PRECISION, 2*6*l->inner_vector_size ); -#ifdef HAVE_TM1p1 - FREE_HUGEPAGES( s->op.clover_doublet_vectorized, PRECISION, 4*2*6*l->inner_vector_size ); -#endif - } -#endif } @@ -649,7 +625,7 @@ void schwarz_PRECISION_boundary_update( schwarz_PRECISION_struct *s, level_struc int i, t, z, y, x, mu, nu, index, *it = s->op.index_table, *dt = s->op.table_dim, ls[4], le[4], buf_length[4], link_size; - vector_PRECISION buf[4] = {NULL,NULL,NULL,NULL}, rbuf[4] = {NULL,NULL,NULL,NULL}; + buffer_PRECISION buf[4] = {NULL,NULL,NULL,NULL}, rbuf[4] = {NULL,NULL,NULL,NULL}; config_PRECISION D=s->op.D; for ( mu=0; mu<4; mu++ ) { @@ -694,7 +670,7 @@ void schwarz_PRECISION_boundary_update( schwarz_PRECISION_struct *s, level_struc for ( y=ls[Y]; yneighbor_rank[2*mu], @@ -715,7 +691,7 @@ void schwarz_PRECISION_boundary_update( schwarz_PRECISION_struct *s, level_struc for ( y=ls[Y]; yblock_boundary_length; -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_float - PRECISION *Dplus = s->op.D_vectorized; - PRECISION *Dminus = s->op.D_transformed_vectorized; - - for ( int mu=0; mu<4; mu++ ) { - boundary_plus_coupling_PRECISION( (PRECISION*)eta, Dplus, (PRECISION*)phi, - mu, bbl[2*mu], bbl[2*mu+1], s->block[k].bt, NULL ); - boundary_minus_coupling_PRECISION( (PRECISION*)eta, Dminus, (PRECISION*)phi, - mu, bbl[2*mu+1], bbl[2*mu+2], s->block[k].bt, NULL ); - } -#else int i, mu, index, neighbor_index; config_PRECISION D_pt, D = s->op.D; - vector_PRECISION phi_pt, eta_pt; + buffer_PRECISION phi_pt, eta_pt; #ifdef HAVE_TM1p1 if( g.n_flavours == 2 ) { @@ -762,8 +727,8 @@ void block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, in index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*index + 9*mu; - phi_pt = phi + 24*neighbor_index; - eta_pt = eta + 24*index; + phi_pt = phi->vector_buffer + 24*neighbor_index; + eta_pt = eta->vector_buffer + 24*index; dprp_T_PRECISION( buf1, phi_pt ); mvm_PRECISION( buf2, D_pt, buf1 ); mvm_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -776,8 +741,8 @@ void 
block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, in index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*neighbor_index + 9*mu; - phi_pt = phi + 24*neighbor_index; - eta_pt = eta + 24*index; + phi_pt = phi->vector_buffer + 24*neighbor_index; + eta_pt = eta->vector_buffer + 24*index; dprn_T_PRECISION( buf1, phi_pt ); mvmh_PRECISION( buf2, D_pt, buf1 ); mvmh_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -792,8 +757,8 @@ void block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, in index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*index + 9*mu; - phi_pt = phi + 24*neighbor_index; - eta_pt = eta + 24*index; + phi_pt = phi->vector_buffer + 24*neighbor_index; + eta_pt = eta->vector_buffer + 24*index; dprp_Z_PRECISION( buf1, phi_pt ); mvm_PRECISION( buf2, D_pt, buf1 ); mvm_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -806,8 +771,8 @@ void block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, in index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*neighbor_index + 9*mu; - phi_pt = phi + 24*neighbor_index; - eta_pt = eta + 24*index; + phi_pt = phi->vector_buffer + 24*neighbor_index; + eta_pt = eta->vector_buffer + 24*index; dprn_Z_PRECISION( buf1, phi_pt ); mvmh_PRECISION( buf2, D_pt, buf1 ); mvmh_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -822,8 +787,8 @@ void block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, in index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*index + 9*mu; - phi_pt = phi + 24*neighbor_index; - eta_pt = eta + 24*index; + phi_pt = phi->vector_buffer + 24*neighbor_index; + eta_pt = eta->vector_buffer + 24*index; dprp_Y_PRECISION( buf1, phi_pt ); mvm_PRECISION( buf2, D_pt, buf1 ); mvm_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -836,8 +801,8 @@ void block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, in index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*neighbor_index + 9*mu; - phi_pt = phi + 24*neighbor_index; - eta_pt = eta + 24*index; + phi_pt = phi->vector_buffer + 24*neighbor_index; + eta_pt = eta->vector_buffer + 24*index; dprn_Y_PRECISION( buf1, phi_pt ); mvmh_PRECISION( buf2, D_pt, buf1 ); mvmh_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -852,8 +817,8 @@ void block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, in index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*index + 9*mu; - phi_pt = phi + 24*neighbor_index; - eta_pt = eta + 24*index; + phi_pt = phi->vector_buffer + 24*neighbor_index; + eta_pt = eta->vector_buffer + 24*index; dprp_X_PRECISION( buf1, phi_pt ); mvm_PRECISION( buf2, D_pt, buf1 ); mvm_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -866,8 +831,8 @@ void block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, in index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*neighbor_index + 9*mu; - phi_pt = phi + 24*neighbor_index; - eta_pt = eta + 24*index; + phi_pt = phi->vector_buffer + 24*neighbor_index; + eta_pt = eta->vector_buffer + 24*index; dprn_X_PRECISION( buf1, phi_pt ); mvmh_PRECISION( buf2, D_pt, buf1 ); mvmh_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -884,8 +849,8 @@ void block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, in index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*index + 9*mu; - phi_pt = phi + 12*neighbor_index; - eta_pt = eta + 12*index; + phi_pt = phi->vector_buffer + 12*neighbor_index; + eta_pt = 
eta->vector_buffer + 12*index; prp_T_PRECISION( buf1, phi_pt ); mvm_PRECISION( buf2, D_pt, buf1 ); mvm_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -896,8 +861,8 @@ void block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, in index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*neighbor_index + 9*mu; - phi_pt = phi + 12*neighbor_index; - eta_pt = eta + 12*index; + phi_pt = phi->vector_buffer + 12*neighbor_index; + eta_pt = eta->vector_buffer + 12*index; prn_T_PRECISION( buf1, phi_pt ); mvmh_PRECISION( buf2, D_pt, buf1 ); mvmh_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -910,8 +875,8 @@ void block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, in index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*index + 9*mu; - phi_pt = phi + 12*neighbor_index; - eta_pt = eta + 12*index; + phi_pt = phi->vector_buffer + 12*neighbor_index; + eta_pt = eta->vector_buffer + 12*index; prp_Z_PRECISION( buf1, phi_pt ); mvm_PRECISION( buf2, D_pt, buf1 ); mvm_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -922,8 +887,8 @@ void block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, in index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*neighbor_index + 9*mu; - phi_pt = phi + 12*neighbor_index; - eta_pt = eta + 12*index; + phi_pt = phi->vector_buffer + 12*neighbor_index; + eta_pt = eta->vector_buffer + 12*index; prn_Z_PRECISION( buf1, phi_pt ); mvmh_PRECISION( buf2, D_pt, buf1 ); mvmh_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -936,8 +901,8 @@ void block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, in index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*index + 9*mu; - phi_pt = phi + 12*neighbor_index; - eta_pt = eta + 12*index; + phi_pt = phi->vector_buffer + 12*neighbor_index; + eta_pt = eta->vector_buffer + 12*index; prp_Y_PRECISION( buf1, phi_pt ); mvm_PRECISION( buf2, D_pt, buf1 ); mvm_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -948,8 +913,8 @@ void block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, in index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*neighbor_index + 9*mu; - phi_pt = phi + 12*neighbor_index; - eta_pt = eta + 12*index; + phi_pt = phi->vector_buffer + 12*neighbor_index; + eta_pt = eta->vector_buffer + 12*index; prn_Y_PRECISION( buf1, phi_pt ); mvmh_PRECISION( buf2, D_pt, buf1 ); mvmh_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -962,8 +927,8 @@ void block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, in index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*index + 9*mu; - phi_pt = phi + 12*neighbor_index; - eta_pt = eta + 12*index; + phi_pt = phi->vector_buffer + 12*neighbor_index; + eta_pt = eta->vector_buffer + 12*index; prp_X_PRECISION( buf1, phi_pt ); mvm_PRECISION( buf2, D_pt, buf1 ); mvm_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -974,8 +939,8 @@ void block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, in index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*neighbor_index + 9*mu; - phi_pt = phi + 12*neighbor_index; - eta_pt = eta + 12*index; + phi_pt = phi->vector_buffer + 12*neighbor_index; + eta_pt = eta->vector_buffer + 12*index; prn_X_PRECISION( buf1, phi_pt ); mvmh_PRECISION( buf2, D_pt, buf1 ); mvmh_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -984,28 +949,16 @@ void block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, in #ifdef HAVE_TM1p1 } #endif -#endif } -void 
n_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, int k, +void n_block_PRECISION_boundary_op( vector_PRECISION *eta, vector_PRECISION *phi, int k, schwarz_PRECISION_struct *s, level_struct *l ) { // k: number of current block int *bbl = s->block_boundary_length; -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_float - PRECISION *Dplus = s->op.D_vectorized; - PRECISION *Dminus = s->op.D_transformed_vectorized; - - for ( int mu=0; mu<4; mu++ ) { - boundary_nplus_coupling_PRECISION( (PRECISION*)eta, Dplus, (PRECISION*)phi, - mu, bbl[2*mu], bbl[2*mu+1], s->block[k].bt, NULL ); - boundary_nminus_coupling_PRECISION( (PRECISION*)eta, Dminus, (PRECISION*)phi, - mu, bbl[2*mu+1], bbl[2*mu+2], s->block[k].bt, NULL ); - } -#else int i, mu, index, neighbor_index; config_PRECISION D_pt, D = s->op.D; - vector_PRECISION phi_pt, eta_pt; + buffer_PRECISION phi_pt, eta_pt; #ifdef HAVE_TM1p1 if( g.n_flavours == 2 ) { @@ -1016,8 +969,8 @@ void n_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*index + 9*mu; - phi_pt = phi + 24*neighbor_index; - eta_pt = eta + 24*index; + phi_pt = phi->vector_buffer + 24*neighbor_index; + eta_pt = eta->vector_buffer + 24*index; dprp_T_PRECISION( buf1, phi_pt ); nmvm_PRECISION( buf2, D_pt, buf1 ); nmvm_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -1030,8 +983,8 @@ void n_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*neighbor_index + 9*mu; - phi_pt = phi + 24*neighbor_index; - eta_pt = eta + 24*index; + phi_pt = phi->vector_buffer + 24*neighbor_index; + eta_pt = eta->vector_buffer + 24*index; dprn_T_PRECISION( buf1, phi_pt ); nmvmh_PRECISION( buf2, D_pt, buf1 ); nmvmh_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -1046,8 +999,8 @@ void n_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*index + 9*mu; - phi_pt = phi + 24*neighbor_index; - eta_pt = eta + 24*index; + phi_pt = phi->vector_buffer + 24*neighbor_index; + eta_pt = eta->vector_buffer + 24*index; dprp_Z_PRECISION( buf1, phi_pt ); nmvm_PRECISION( buf2, D_pt, buf1 ); nmvm_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -1060,8 +1013,8 @@ void n_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*neighbor_index + 9*mu; - phi_pt = phi + 24*neighbor_index; - eta_pt = eta + 24*index; + phi_pt = phi->vector_buffer + 24*neighbor_index; + eta_pt = eta->vector_buffer + 24*index; dprn_Z_PRECISION( buf1, phi_pt ); nmvmh_PRECISION( buf2, D_pt, buf1 ); nmvmh_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -1076,8 +1029,8 @@ void n_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*index + 9*mu; - phi_pt = phi + 24*neighbor_index; - eta_pt = eta + 24*index; + phi_pt = phi->vector_buffer + 24*neighbor_index; + eta_pt = eta->vector_buffer + 24*index; dprp_Y_PRECISION( buf1, phi_pt ); nmvm_PRECISION( buf2, D_pt, buf1 ); nmvm_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -1090,8 +1043,8 @@ void n_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*neighbor_index + 9*mu; - phi_pt = phi + 24*neighbor_index; - eta_pt = eta + 24*index; + phi_pt = phi->vector_buffer + 
24*neighbor_index; + eta_pt = eta->vector_buffer + 24*index; dprn_Y_PRECISION( buf1, phi_pt ); nmvmh_PRECISION( buf2, D_pt, buf1 ); nmvmh_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -1106,8 +1059,8 @@ void n_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*index + 9*mu; - phi_pt = phi + 24*neighbor_index; - eta_pt = eta + 24*index; + phi_pt = phi->vector_buffer + 24*neighbor_index; + eta_pt = eta->vector_buffer + 24*index; dprp_X_PRECISION( buf1, phi_pt ); nmvm_PRECISION( buf2, D_pt, buf1 ); nmvm_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -1120,8 +1073,8 @@ void n_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*neighbor_index + 9*mu; - phi_pt = phi + 24*neighbor_index; - eta_pt = eta + 24*index; + phi_pt = phi->vector_buffer + 24*neighbor_index; + eta_pt = eta->vector_buffer + 24*index; dprn_X_PRECISION( buf1, phi_pt ); nmvmh_PRECISION( buf2, D_pt, buf1 ); nmvmh_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -1138,8 +1091,8 @@ void n_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*index + 9*mu; - phi_pt = phi + 12*neighbor_index; - eta_pt = eta + 12*index; + phi_pt = phi->vector_buffer + 12*neighbor_index; + eta_pt = eta->vector_buffer + 12*index; prp_T_PRECISION( buf1, phi_pt ); nmvm_PRECISION( buf2, D_pt, buf1 ); nmvm_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -1150,8 +1103,8 @@ void n_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*neighbor_index + 9*mu; - phi_pt = phi + 12*neighbor_index; - eta_pt = eta + 12*index; + phi_pt = phi->vector_buffer + 12*neighbor_index; + eta_pt = eta->vector_buffer + 12*index; prn_T_PRECISION( buf1, phi_pt ); nmvmh_PRECISION( buf2, D_pt, buf1 ); nmvmh_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -1164,8 +1117,8 @@ void n_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*index + 9*mu; - phi_pt = phi + 12*neighbor_index; - eta_pt = eta + 12*index; + phi_pt = phi->vector_buffer + 12*neighbor_index; + eta_pt = eta->vector_buffer + 12*index; prp_Z_PRECISION( buf1, phi_pt ); nmvm_PRECISION( buf2, D_pt, buf1 ); nmvm_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -1176,8 +1129,8 @@ void n_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*neighbor_index + 9*mu; - phi_pt = phi + 12*neighbor_index; - eta_pt = eta + 12*index; + phi_pt = phi->vector_buffer + 12*neighbor_index; + eta_pt = eta->vector_buffer + 12*index; prn_Z_PRECISION( buf1, phi_pt ); nmvmh_PRECISION( buf2, D_pt, buf1 ); nmvmh_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -1190,8 +1143,8 @@ void n_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*index + 9*mu; - phi_pt = phi + 12*neighbor_index; - eta_pt = eta + 12*index; + phi_pt = phi->vector_buffer + 12*neighbor_index; + eta_pt = eta->vector_buffer + 12*index; prp_Y_PRECISION( buf1, phi_pt ); nmvm_PRECISION( buf2, D_pt, buf1 ); nmvm_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -1202,8 +1155,8 @@ void n_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, index = 
s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*neighbor_index + 9*mu; - phi_pt = phi + 12*neighbor_index; - eta_pt = eta + 12*index; + phi_pt = phi->vector_buffer + 12*neighbor_index; + eta_pt = eta->vector_buffer + 12*index; prn_Y_PRECISION( buf1, phi_pt ); nmvmh_PRECISION( buf2, D_pt, buf1 ); nmvmh_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -1216,8 +1169,8 @@ void n_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*index + 9*mu; - phi_pt = phi + 12*neighbor_index; - eta_pt = eta + 12*index; + phi_pt = phi->vector_buffer + 12*neighbor_index; + eta_pt = eta->vector_buffer + 12*index; prp_X_PRECISION( buf1, phi_pt ); nmvm_PRECISION( buf2, D_pt, buf1 ); nmvm_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -1228,8 +1181,8 @@ void n_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, index = s->block[k].bt[i]; neighbor_index = s->block[k].bt[i+1]; D_pt = D + 36*neighbor_index + 9*mu; - phi_pt = phi + 12*neighbor_index; - eta_pt = eta + 12*index; + phi_pt = phi->vector_buffer + 12*neighbor_index; + eta_pt = eta->vector_buffer + 12*index; prn_X_PRECISION( buf1, phi_pt ); nmvmh_PRECISION( buf2, D_pt, buf1 ); nmvmh_PRECISION( buf2+3, D_pt, buf1+3 ); @@ -1237,41 +1190,15 @@ void n_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, } #ifdef HAVE_TM1p1 } -#endif #endif } -void coarse_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, +void coarse_block_PRECISION_boundary_op( vector_PRECISION *eta, vector_PRECISION *phi, int k, schwarz_PRECISION_struct *s, level_struct *l ) { // k: number of current block int *bbl = s->block_boundary_length, n = l->num_lattice_site_var; -#ifdef OPTIMIZED_COARSE_NEIGHBOR_COUPLING_PRECISION - int column_offset = 2*SIMD_LENGTH_PRECISION*((l->num_parent_eig_vect+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - int vectorized_link_offset = 4*l->num_parent_eig_vect*column_offset; - - for ( int mu=0; mu<4; mu++ ) { - OPERATOR_TYPE_PRECISION *Dplus = s->op.D_vectorized + mu*vectorized_link_offset; - OPERATOR_TYPE_PRECISION *Dminus = s->op.D_transformed_vectorized + mu*vectorized_link_offset; - // plus mu direction - for ( int i=bbl[2*mu]; iblock[k].bt[i]; - int neighbor_index = s->block[k].bt[i+1]; - vector_PRECISION phi_pt = phi + n*neighbor_index; - vector_PRECISION eta_pt = eta + n*index; - coarse_hopp_PRECISION_vectorized( eta_pt, phi_pt, Dplus + 4*vectorized_link_offset*index, l ); - } - // minus mu direction - for ( int i=bbl[2*mu+1]; iblock[k].bt[i]; - int neighbor_index = s->block[k].bt[i+1]; - vector_PRECISION phi_pt = phi + n*neighbor_index; - vector_PRECISION eta_pt = eta + n*index; - coarse_hopp_PRECISION_vectorized( eta_pt, phi_pt, Dminus + 4*vectorized_link_offset*neighbor_index, l ); - } - } -#else config_PRECISION D = s->op.D; int link_size = SQUARE(2*l->num_parent_eig_vect), site_size=4*link_size; @@ -1280,54 +1207,30 @@ void coarse_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION for ( int i=bbl[2*mu]; iblock[k].bt[i]; int neighbor_index = s->block[k].bt[i+1]; - vector_PRECISION phi_pt = phi + n*neighbor_index; - vector_PRECISION eta_pt = eta + n*index; + vector_PRECISION phi_pt, eta_pt; + phi_pt.vector_buffer = phi->vector_buffer + n*neighbor_index; + eta_pt.vector_buffer = eta->vector_buffer + n*index; config_PRECISION D_pt = D + site_size*index + link_size*mu; - coarse_hopp_PRECISION( eta_pt, phi_pt, D_pt, l ); + coarse_hopp_PRECISION( &eta_pt, &phi_pt, D_pt, l 
); } // minus mu direction for ( int i=bbl[2*mu+1]; iblock[k].bt[i]; int neighbor_index = s->block[k].bt[i+1]; - vector_PRECISION phi_pt = phi + n*neighbor_index; - vector_PRECISION eta_pt = eta + n*index; + vector_PRECISION phi_pt, eta_pt; + phi_pt.vector_buffer = phi->vector_buffer + n*neighbor_index; + eta_pt.vector_buffer = eta->vector_buffer + n*index; config_PRECISION D_pt = D + site_size*neighbor_index + link_size*mu; - coarse_daggered_hopp_PRECISION( eta_pt, phi_pt, D_pt, l ); + coarse_daggered_hopp_PRECISION( &eta_pt, &phi_pt, D_pt, l ); } } -#endif } -void n_coarse_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, +void n_coarse_block_PRECISION_boundary_op( vector_PRECISION *eta, vector_PRECISION *phi, int k, schwarz_PRECISION_struct *s, level_struct *l ) { // k: number of current block int *bbl = s->block_boundary_length, n = l->num_lattice_site_var; -#ifdef OPTIMIZED_COARSE_NEIGHBOR_COUPLING_PRECISION - int column_offset = 2*SIMD_LENGTH_PRECISION*((l->num_parent_eig_vect+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - int vectorized_link_offset = 4*l->num_parent_eig_vect*column_offset; - - for ( int mu=0; mu<4; mu++ ) { - OPERATOR_TYPE_PRECISION *Dplus = s->op.D_vectorized + mu*vectorized_link_offset; - OPERATOR_TYPE_PRECISION *Dminus = s->op.D_transformed_vectorized + mu*vectorized_link_offset; - // plus mu direction - for ( int i=bbl[2*mu]; iblock[k].bt[i]; - int neighbor_index = s->block[k].bt[i+1]; - vector_PRECISION phi_pt = phi + n*neighbor_index; - vector_PRECISION eta_pt = eta + n*index; - coarse_n_hopp_PRECISION_vectorized( eta_pt, phi_pt, Dplus + 4*vectorized_link_offset*index, l ); - } - // minus mu direction - for ( int i=bbl[2*mu+1]; iblock[k].bt[i]; - int neighbor_index = s->block[k].bt[i+1]; - vector_PRECISION phi_pt = phi + n*neighbor_index; - vector_PRECISION eta_pt = eta + n*index; - coarse_n_hopp_PRECISION_vectorized( eta_pt, phi_pt, Dminus + 4*vectorized_link_offset*neighbor_index, l ); - } - } -#else int link_size = SQUARE(2*l->num_parent_eig_vect), site_size=4*link_size; config_PRECISION D = s->op.D; @@ -1336,22 +1239,23 @@ void n_coarse_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISIO for ( int i=bbl[2*mu]; iblock[k].bt[i]; int neighbor_index = s->block[k].bt[i+1]; - vector_PRECISION phi_pt = phi + n*neighbor_index; - vector_PRECISION eta_pt = eta + n*index; + vector_PRECISION phi_pt, eta_pt; + phi_pt.vector_buffer = phi->vector_buffer + n*neighbor_index; + eta_pt.vector_buffer = eta->vector_buffer + n*index; config_PRECISION D_pt = D + site_size*index + link_size*mu; - coarse_n_hopp_PRECISION( eta_pt, phi_pt, D_pt, l ); + coarse_n_hopp_PRECISION( &eta_pt, &phi_pt, D_pt, l ); } // minus mu direction for ( int i=bbl[2*mu+1]; iblock[k].bt[i]; int neighbor_index = s->block[k].bt[i+1]; - vector_PRECISION phi_pt = phi + n*neighbor_index; - vector_PRECISION eta_pt = eta + n*index; + vector_PRECISION phi_pt, eta_pt; + phi_pt.vector_buffer = phi->vector_buffer + n*neighbor_index; + eta_pt.vector_buffer = eta->vector_buffer + n*index; config_PRECISION D_pt = D + site_size*neighbor_index + link_size*mu; - coarse_n_daggered_hopp_PRECISION( eta_pt, phi_pt, D_pt, l ); + coarse_n_daggered_hopp_PRECISION( &eta_pt, &phi_pt, D_pt, l ); } } -#endif } @@ -1412,18 +1316,19 @@ void schwarz_PRECISION_setup( schwarz_PRECISION_struct *s, operator_double_struc } -void additive_schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_phi, vector_PRECISION eta, const int cycles, int res, +void additive_schwarz_PRECISION( 
vector_PRECISION *phi, vector_PRECISION *D_phi, vector_PRECISION *eta, const int cycles, int res, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ) { START_NO_HYPERTHREADS(threading) int k, mu, i, nb = s->num_blocks; - vector_PRECISION r = s->buf1, Dphi = s->buf4, latest_iter = s->buf2, x = s->buf3, latest_iter2 = s->buf5, swap = NULL; + vector_PRECISION *r = &(s->buf1), *Dphi = &(s->buf4), *latest_iter = &(s->buf2), *x = &(s->buf3), *latest_iter2 = &(s->buf5), *swap = NULL; void (*block_op)() = (l->depth==0)?block_d_plus_clover_PRECISION:coarse_block_operator_PRECISION; void (*boundary_op)() = (l->depth==0)?block_PRECISION_boundary_op:coarse_block_PRECISION_boundary_op, (*n_boundary_op)() = (l->depth==0)?n_block_PRECISION_boundary_op:n_coarse_block_PRECISION_boundary_op, - (* block_solve)() = (l->depth==0&&g.odd_even)?block_solve_oddeven_PRECISION:local_minres_PRECISION; + (*block_solve)() = (l->depth==0&&g.odd_even)?block_solve_oddeven_PRECISION:local_minres_PRECISION; + //vector_PRECISION_init(swap); int nb_thread_start; int nb_thread_end; @@ -1594,16 +1499,13 @@ void additive_schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_phi, v } -void red_black_schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_phi, vector_PRECISION eta, const int cycles, int res, +void red_black_schwarz_PRECISION( vector_PRECISION *phi, vector_PRECISION *D_phi, vector_PRECISION *eta, const int cycles, int res, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ) { START_NO_HYPERTHREADS(threading) int k=0, mu, i, init_res = res, res_comm = res, step; - vector_PRECISION r = s->buf1; - vector_PRECISION Dphi = s->buf4; - vector_PRECISION latest_iter = s->buf2; - vector_PRECISION x = s->buf3; + vector_PRECISION *r = &(s->buf1), *Dphi = &(s->buf4), *latest_iter = &(s->buf2), *x = &(s->buf3); void (*block_op)() = (l->depth==0)?block_d_plus_clover_PRECISION:coarse_block_operator_PRECISION; void (*boundary_op)() = (l->depth==0)?block_PRECISION_boundary_op:coarse_block_PRECISION_boundary_op, (*n_boundary_op)() = (l->depth==0)?n_block_PRECISION_boundary_op:n_coarse_block_PRECISION_boundary_op, @@ -1612,7 +1514,7 @@ void red_black_schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_phi, int commdir[8] = {+1,-1,-1,+1,-1,+1,+1,-1}; SYNC_CORES(threading) - + int block_thread_start[8], block_thread_end[8]; for ( i=0; i<8; i++ ) compute_core_start_end_custom(0, s->block_list_length[i], block_thread_start+i, block_thread_end+i, l, threading, 1 ); @@ -1658,9 +1560,9 @@ void red_black_schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_phi, PROF_PRECISION_STOP( _SM3, 1 ); PROF_PRECISION_START( _SM4 ); END_MASTER(threading) - // local minres updates x, r and latest iter + // local minres updates x, r and latest iter block_solve( x, r, latest_iter, s->block[index].start*l->num_lattice_site_var, s, l, no_threading ); - START_MASTER(threading) + START_MASTER(threading) PROF_PRECISION_STOP( _SM4, 1 ); END_MASTER(threading) } @@ -1764,16 +1666,13 @@ void red_black_schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_phi, } -void schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_phi, vector_PRECISION eta, const int cycles, int res, +void schwarz_PRECISION( vector_PRECISION *phi, vector_PRECISION *D_phi, vector_PRECISION *eta, const int cycles, int res, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ) { START_NO_HYPERTHREADS(threading) int color, k, mu, i, nb = s->num_blocks, init_res = res; - vector_PRECISION r = s->buf1; 
- vector_PRECISION Dphi = s->buf4; - vector_PRECISION latest_iter = s->buf2; - vector_PRECISION x = s->buf3; + vector_PRECISION *r = &(s->buf1), *Dphi = &(s->buf4), *latest_iter = &(s->buf2), *x = &(s->buf3); void (*block_op)() = (l->depth==0)?block_d_plus_clover_PRECISION:coarse_block_operator_PRECISION; void (*boundary_op)() = (l->depth==0)?block_PRECISION_boundary_op:coarse_block_PRECISION_boundary_op, (*n_boundary_op)() = (l->depth==0)?n_block_PRECISION_boundary_op:n_coarse_block_PRECISION_boundary_op, @@ -1980,7 +1879,7 @@ void schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_phi, vector_PRE } -void sixteen_color_schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_phi, vector_PRECISION eta, const int cycles, int res, +void sixteen_color_schwarz_PRECISION( vector_PRECISION *phi, vector_PRECISION *D_phi, vector_PRECISION *eta, const int cycles, int res, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ) { START_NO_HYPERTHREADS(threading) @@ -1989,7 +1888,7 @@ void sixteen_color_schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_p if ( s->num_colors == 2 ) schwarz_PRECISION( phi, D_phi, eta, cycles, res, s, l, no_threading ); else { int color, k, mu, i, nb = s->num_blocks; - vector_PRECISION r = s->buf1, Dphi = s->buf4, latest_iter = s->buf2, x = s->buf3; + vector_PRECISION *r = &(s->buf1), *Dphi = &(s->buf4), *latest_iter = &(s->buf2), *x = &(s->buf3); void (*block_op)() = (l->depth==0)?block_d_plus_clover_PRECISION:coarse_block_operator_PRECISION; void (*boundary_op)() = (l->depth==0)?block_PRECISION_boundary_op:coarse_block_PRECISION_boundary_op, (*n_boundary_op)() = (l->depth==0)?n_block_PRECISION_boundary_op:n_coarse_block_PRECISION_boundary_op, @@ -2101,11 +2000,11 @@ void sixteen_color_schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_p #ifdef SCHWARZ_RES START_LOCKED_MASTER(threading) - vector_PRECISION true_r = NULL; - - PUBLIC_MALLOC( true_r, complex_PRECISION, l->vector_size ); - vector_PRECISION_define( true_r, 0, 0, l->inner_vector_size, l ); + vector_PRECISION true_r; + vector_PRECISION_init(&true_r); + vector_PRECISION_alloc( &true_r, _ORDINARY, 1, l, threading ); + vector_PRECISION_define( &true_r, 0, 0, l->inner_vector_size, l ); if ( D_phi == NULL ) { for ( mu=0; mu<4; mu++ ) { @@ -2113,24 +2012,24 @@ void sixteen_color_schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_p ghost_update_PRECISION( x, mu, -1, &(s->op.c), l ); } for ( i=0; iblock[i].start*l->num_lattice_site_var, s, l, no_threading ); + block_op( &true_r, x, s->block[i].start*l->num_lattice_site_var, s, l, no_threading ); } for ( mu=0; mu<4; mu++ ) { ghost_update_wait_PRECISION( x, mu, +1, &(s->op.c), l ); ghost_update_wait_PRECISION( x, mu, -1, &(s->op.c), l ); } for ( i=0; iinner_vector_size, l ); - PRECISION r_norm = global_norm_PRECISION( true_r, 0, l->inner_vector_size, l, no_threading ), + vector_PRECISION_saxpy( &true_r, eta, &true_r, -1, 0, l->inner_vector_size, l ); + PRECISION r_norm = global_norm_PRECISION( &true_r, 0, l->inner_vector_size, l, no_threading ), den = global_norm_PRECISION( eta, 0, l->inner_vector_size, l, no_threading ); r_norm/=den; char number[3]; sprintf( number, "%2d", 31+l->depth ); printf0("\033[1;%2sm|", number ); printf0(" ---- depth: %d, c: %d, schwarz iter %2d, norm: %11.6le |", l->depth, s->num_colors, k, r_norm ); printf0("\033[0m\n"); fflush(0); - PUBLIC_FREE( true_r, complex_PRECISION, l->vector_size ); + vector_PRECISION_free( &true_r, l, threading ); END_LOCKED_MASTER(threading) #endif } @@ 
-2139,10 +2038,10 @@ void sixteen_color_schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_p } -void trans_PRECISION( vector_PRECISION out, vector_double in, int *tt, level_struct *l, struct Thread *threading ) { +void trans_PRECISION( vector_PRECISION *out, vector_double *in, int *tt, level_struct *l, struct Thread *threading ) { int i, index; - vector_PRECISION out_pt = out; vector_double in_pt = in; + buffer_PRECISION out_pt = out->vector_buffer; buffer_double in_pt = in->vector_buffer; int start = threading->start_site[l->depth]; int end = threading->end_site[l->depth]; @@ -2153,16 +2052,16 @@ void trans_PRECISION( vector_PRECISION out, vector_double in, int *tt, level_str if( g.n_flavours == 2 ) for ( i=start; i<end; i++ ) { index = tt[i]; - out_pt = out + 24*index; - in_pt = in + 24*i; + out_pt = out->vector_buffer + 24*index; + in_pt = in->vector_buffer + 24*i; FOR24( *out_pt = (complex_PRECISION) *in_pt; out_pt++; in_pt++; ) } else #endif for ( i=start; i<end; i++ ) { index = tt[i]; - out_pt = out + 12*index; - in_pt = in + 12*i; + out_pt = out->vector_buffer + 12*index; + in_pt = in->vector_buffer + 12*i; FOR12( *out_pt = (complex_PRECISION) *in_pt; out_pt++; in_pt++; ) } END_NO_HYPERTHREADS(threading) @@ -2170,10 +2069,10 @@ void trans_PRECISION( vector_PRECISION out, vector_double in, int *tt, level_str } -void trans_back_PRECISION( vector_double out, vector_PRECISION in, int *tt, level_struct *l, struct Thread *threading ) { +void trans_back_PRECISION( vector_double *out, vector_PRECISION *in, int *tt, level_struct *l, struct Thread *threading ) { int i, index; - vector_double out_pt = out; vector_PRECISION in_pt = in; + buffer_double out_pt = out->vector_buffer; buffer_PRECISION in_pt = in->vector_buffer; int start = threading->start_site[l->depth]; int end = threading->end_site[l->depth]; @@ -2184,16 +2083,16 @@ void trans_back_PRECISION( vector_double out, vector_PRECISION in, int *tt, leve if( g.n_flavours == 2 ) for ( i=start; i<end; i++ ) { index = tt[i]; - in_pt = in + 24*index; - out_pt = out + 24*i; + in_pt = in->vector_buffer + 24*index; + out_pt = out->vector_buffer + 24*i; FOR24( *out_pt = (complex_double) *in_pt; out_pt++; in_pt++; ) } else #endif for ( i=start; i<end; i++ ) { index = tt[i]; - in_pt = in + 12*index; - out_pt = out + 12*i; + in_pt = in->vector_buffer + 12*index; + out_pt = out->vector_buffer + 12*i; FOR12( *out_pt = (complex_double) *in_pt; out_pt++; in_pt++; ) } END_NO_HYPERTHREADS(threading) @@ -2201,6 +2100,79 @@ void trans_back_PRECISION( vector_double out, vector_PRECISION in, int *tt, leve } +void trans_PRECISION_new( vector_PRECISION *out, vector_double *in, int *tt, level_struct *l, struct Thread *threading ) { + + int i, j, k, index; + buffer_PRECISION out_pt = out->vector_buffer; buffer_double in_pt = in->vector_buffer; + int start = threading->start_site[l->depth]; + int end = threading->end_site[l->depth]; + //compute_core_start_end(0, in->size, &start, &end, l, threading); + + // this function reorders the data according to the translation table; barriers ensure that everything is in sync + SYNC_CORES(threading) + START_NO_HYPERTHREADS(threading) +#ifdef HAVE_TM1p1 + if( g.n_flavours == 2 ) + for ( i=start; i<end; i++ ) { + index = tt[i]; + out_pt = out->vector_buffer + 24*index; + in_pt = in->vector_buffer + 24*i; + FOR24( *out_pt = (complex_PRECISION) *in_pt; out_pt++; in_pt++; ) + } + else +#endif + for ( i=start; i<end; i++ ) { + index = tt[i]; + out_pt = out->vector_buffer + 12*index*in->num_vect; + in_pt = in->vector_buffer + 12*i*in->num_vect; + for( k=0; k<12; k++) + for( j=0; j<in->num_vect; j++){ + *out_pt = (complex_PRECISION) *in_pt; + out_pt++; + in_pt++; + } + } + END_NO_HYPERTHREADS(threading) + SYNC_CORES(threading) +} + + +void trans_back_PRECISION_new( vector_double *out, vector_PRECISION *in, int *tt, level_struct *l, struct Thread *threading ) { + + int i, j, k, index; + buffer_double out_pt = out->vector_buffer; buffer_PRECISION in_pt = in->vector_buffer; + int start =
threading->start_site[l->depth]; + int end = threading->end_site[l->depth]; + + // this function reorders the data according to the translation table; barriers ensure that everything is in sync + SYNC_CORES(threading) + START_NO_HYPERTHREADS(threading) +#ifdef HAVE_TM1p1 + if( g.n_flavours == 2 ) + for ( i=start; i<end; i++ ) { + index = tt[i]; + in_pt = in->vector_buffer + 24*index; + out_pt = out->vector_buffer + 24*i; + FOR24( *out_pt = (complex_double) *in_pt; out_pt++; in_pt++; ) + } + else +#endif + for ( i=start; i<end; i++ ) { + index = tt[i]; + in_pt = in->vector_buffer + 12*index*in->num_vect; + out_pt = out->vector_buffer + 12*i*in->num_vect; + for( k=0; k<12; k++) + for( j=0; j<in->num_vect; j++){ + *out_pt = (complex_double) *in_pt; + out_pt++; + in_pt++; + } + } + END_NO_HYPERTHREADS(threading) + SYNC_CORES(threading) +} + + void schwarz_PRECISION_def( schwarz_PRECISION_struct *s, operator_double_struct *op, level_struct *l ) { schwarz_PRECISION_alloc( s, l ); @@ -2214,50 +2186,51 @@ void schwarz_PRECISION_mvm_testfun( schwarz_PRECISION_struct *s, level_struct *l START_UNTHREADED_FUNCTION(threading) int mu, i, nb = s->num_blocks; - int svs = l->schwarz_vector_size; int ivs = l->inner_vector_size; - int vs = l->vector_size; void (*block_op)() = (l->depth==0)?block_d_plus_clover_PRECISION:coarse_block_operator_PRECISION; void (*op)() = (l->depth==0)?d_plus_clover_PRECISION:apply_coarse_operator_PRECISION; void (*boundary_op)() = (l->depth==0)?block_PRECISION_boundary_op:coarse_block_PRECISION_boundary_op; - vector_PRECISION v1 = NULL, v2 = NULL, v3 = NULL; + vector_PRECISION v1, v2, v3; PRECISION diff; - MALLOC( v1, complex_PRECISION, svs ); - MALLOC( v2, complex_PRECISION, vs ); - MALLOC( v3, complex_PRECISION, vs ); + vector_PRECISION_init( &v1 ); + vector_PRECISION_init( &v2 ); + vector_PRECISION_init( &v3 ); + + vector_PRECISION_alloc( &v1, _SCHWARZ, 1, l, no_threading ); + vector_PRECISION_alloc( &v2, _ORDINARY, 1, l, no_threading ); + vector_PRECISION_alloc( &v3, _ORDINARY, 1, l, no_threading ); - vector_PRECISION_define_random( v1, 0, ivs, l ); + vector_PRECISION_define_random( &v1, 0, ivs, l ); - op( v3, v1, &(s->op), l, no_threading ); + op( &v3, &v1, &(s->op), l, no_threading ); for ( mu=0; mu<4; mu++ ) { - ghost_update_PRECISION( v1, mu, +1, &(s->op.c), l ); - ghost_update_PRECISION( v1, mu, -1, &(s->op.c), l ); + ghost_update_PRECISION( &v1, mu, +1, &(s->op.c), l ); + ghost_update_PRECISION( &v1, mu, -1, &(s->op.c), l ); } for ( mu=0; mu<4; mu++ ) { - ghost_update_wait_PRECISION( v1, mu, +1, &(s->op.c), l ); - ghost_update_wait_PRECISION( v1, mu, -1, &(s->op.c), l ); + ghost_update_wait_PRECISION( &v1, mu, +1, &(s->op.c), l ); + ghost_update_wait_PRECISION( &v1, mu, -1, &(s->op.c), l ); } for ( i=0; i<nb; i++ ) { - block_op( v2, v1, s->block[i].start*l->num_lattice_site_var, s, l, no_threading ); - boundary_op( v2, v1, i, s, l, no_threading ); + block_op( &v2, &v1, s->block[i].start*l->num_lattice_site_var, s, l, no_threading ); + boundary_op( &v2, &v1, i, s, l, no_threading ); } - vector_PRECISION_minus( v3, v3, v2, 0, l->inner_vector_size, l ); - diff = global_norm_PRECISION( v3, 0, l->inner_vector_size, l, no_threading ) / - global_norm_PRECISION( v2, 0, l->inner_vector_size, l, no_threading ); + vector_PRECISION_minus( &v3, &v3, &v2, 0, l->inner_vector_size, l ); + diff = global_norm_PRECISION( &v3, 0, l->inner_vector_size, l, no_threading ) / + global_norm_PRECISION( &v2, 0, l->inner_vector_size, l, no_threading ); test0_PRECISION("depth: %d, correctness of local residual vector: %le\n", l->depth, diff ); - - FREE( v1, complex_PRECISION, l->schwarz_vector_size ); - FREE( v2, complex_PRECISION, l->vector_size
); - FREE( v3, complex_PRECISION, l->vector_size ); + + vector_PRECISION_free( &v1, l, no_threading ); + vector_PRECISION_free( &v2, l, no_threading ); + vector_PRECISION_free( &v3, l, no_threading ); END_UNTHREADED_FUNCTION(threading) } - diff --git a/src/schwarz_generic.h b/src/schwarz_generic.h index fab1613..2bc22d7 100644 --- a/src/schwarz_generic.h +++ b/src/schwarz_generic.h @@ -24,13 +24,13 @@ struct Thread; - void block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, int k, + void block_PRECISION_boundary_op( vector_PRECISION *eta, vector_PRECISION *phi, int k, schwarz_PRECISION_struct *s, level_struct *l ); - void n_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, int k, + void n_block_PRECISION_boundary_op( vector_PRECISION *eta, vector_PRECISION *phi, int k, schwarz_PRECISION_struct *s, level_struct *l ); - void coarse_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, + void coarse_block_PRECISION_boundary_op( vector_PRECISION *eta, vector_PRECISION *phi, int k, schwarz_PRECISION_struct *s, level_struct *l ); - void n_coarse_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, + void n_coarse_block_PRECISION_boundary_op( vector_PRECISION *eta, vector_PRECISION *phi, int k, schwarz_PRECISION_struct *s, level_struct *l ); void smoother_PRECISION_def( level_struct *l ); @@ -46,17 +46,19 @@ struct Thread; void schwarz_PRECISION_def( schwarz_PRECISION_struct *s, operator_double_struct *op, level_struct *l ); void schwarz_PRECISION_free( schwarz_PRECISION_struct *s, level_struct *l ); - void additive_schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_phi, vector_PRECISION eta, const int cycles, int res, + void additive_schwarz_PRECISION( vector_PRECISION *phi, vector_PRECISION *D_phi, vector_PRECISION *eta, const int cycles, int res, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ); - void red_black_schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_phi, vector_PRECISION eta, const int cycles, int res, + void red_black_schwarz_PRECISION( vector_PRECISION *phi, vector_PRECISION *D_phi, vector_PRECISION *eta, const int cycles, int res, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ); - void schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_phi, vector_PRECISION eta, const int cycles, int res, + void schwarz_PRECISION( vector_PRECISION *phi, vector_PRECISION *D_phi, vector_PRECISION *eta, const int cycles, int res, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ); - void sixteen_color_schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_phi, vector_PRECISION eta, const int cycles, int res, + void sixteen_color_schwarz_PRECISION( vector_PRECISION *phi, vector_PRECISION *D_phi, vector_PRECISION *eta, const int cycles, int res, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ); - void trans_PRECISION( vector_PRECISION out, vector_double in, int *tt, level_struct *l, struct Thread *threading ); - void trans_back_PRECISION( vector_double out, vector_PRECISION in, int *tt, level_struct *l, struct Thread *threading ); + void trans_PRECISION( vector_PRECISION *out, vector_double *in, int *tt, level_struct *l, struct Thread *threading ); + void trans_back_PRECISION( vector_double *out, vector_PRECISION *in, int *tt, level_struct *l, struct Thread *threading ); + void trans_PRECISION_new( vector_PRECISION *out, vector_double *in, int *tt, level_struct *l, struct Thread *threading ); 
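(Illustrative sketch, not part of the patch: it shows how the struct-based vectors introduced by this diff would be driven through the multi-right-hand-side translation routine declared just above, together with trans_back_PRECISION_new declared next. All identifiers are taken from calls that already appear elsewhere in this diff (vector_double_init/_alloc/_free, vector_PRECISION_init/_alloc/_free, the _INNER and _ORDINARY size classes, l->s_PRECISION.op.translation_table, the global no_threading); the reading of the third _alloc argument as the number of right-hand sides, and the num_vect semantics, are assumptions inferred from the surrounding hunks.)

  /* hypothetical example following the allocation pattern used in this diff */
  void translation_roundtrip_example_PRECISION( level_struct *l ) {
    vector_double vd;      // fields in double precision, lexicographic site order
    vector_PRECISION vp;   // same fields in working precision, translated site order

    vector_double_init( &vd );
    vector_PRECISION_init( &vp );
    vector_double_alloc( &vd, _INNER, 1, l, no_threading );      // 1 = assumed number of rhs
    vector_PRECISION_alloc( &vp, _ORDINARY, 1, l, no_threading );

    // forward: copy 12 spin-color components per site and per right-hand side,
    // placing site i of the input at site tt[i] of the output
    trans_PRECISION_new( &vp, &vd, l->s_PRECISION.op.translation_table, l, no_threading );

    // ... smoother / solver acting on vp ...

    // backward: undo the reordering and promote back to double precision
    trans_back_PRECISION_new( &vd, &vp, l->s_PRECISION.op.translation_table, l, no_threading );

    vector_PRECISION_free( &vp, l, no_threading );
    vector_double_free( &vd, l, no_threading );
  }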
+ void trans_back_PRECISION_new( vector_double *out, vector_PRECISION *in, int *tt, level_struct *l, struct Thread *threading ); void schwarz_PRECISION_mvm_testfun( schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ); @@ -74,22 +76,4 @@ struct Thread; } } -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_float -static inline void set_PRECISION_D_vectorized( PRECISION *out1, PRECISION *out2, complex_PRECISION *in ) { - // out1: column major, out2: row major - for ( int i=0; i<3; i++ ) { // column - for ( int j=0; j<3; j++ ) { // row - out1[8*i +j] = creal_PRECISION(in[3*j+i]); - out1[8*i+4+j] = cimag_PRECISION(in[3*j+i]); - out2[8*i +j] = creal_PRECISION(in[j+3*i]); - out2[8*i+4+j] = cimag_PRECISION(in[j+3*i]); - } - out1[8*i+3] = 0.0; - out1[8*i+7] = 0.0; - out2[8*i+3] = 0.0; - out2[8*i+7] = 0.0; - } -} -#endif - #endif diff --git a/src/setup_generic.c b/src/setup_generic.c index 4493bae..e91c3c9 100644 --- a/src/setup_generic.c +++ b/src/setup_generic.c @@ -32,13 +32,8 @@ void coarse_grid_correction_PRECISION_setup( level_struct *l, struct Thread *thr START_LOCKED_MASTER(threading) coarse_operator_PRECISION_alloc( l ); -#ifndef INTERPOLATION_SETUP_LAYOUT_OPTIMIZED_PRECISION coarse_operator_PRECISION_setup( l->is_PRECISION.interpolation, l ); END_LOCKED_MASTER(threading) -#else - END_LOCKED_MASTER(threading) - coarse_operator_PRECISION_setup_vectorized( l->is_PRECISION.operator, l, threading ); -#endif START_LOCKED_MASTER(threading) if ( !l->next_level->idle ) { @@ -91,12 +86,12 @@ void coarse_grid_correction_PRECISION_setup( level_struct *l, struct Thread *thr SYNC_HYPERTHREADS(threading) if ( !l->idle ) { for ( int i=0; inext_level->num_eig_vect,l->num_eig_vect); i++ ) { - restrict_PRECISION( l->next_level->is_PRECISION.test_vector[i], l->is_PRECISION.test_vector[i], l, threading ); + restrict_PRECISION( &(l->next_level->is_PRECISION.test_vector[i]), &(l->is_PRECISION.test_vector[i]), l, threading ); } START_LOCKED_MASTER(threading) for ( int i=MIN(l->next_level->num_eig_vect,l->num_eig_vect); inext_level->num_eig_vect; i++ ) { if ( !l->next_level->idle ) - vector_PRECISION_define_random( l->next_level->is_PRECISION.test_vector[i], 0, + vector_PRECISION_define_random( &(l->next_level->is_PRECISION.test_vector[i]), 0, l->next_level->inner_vector_size, l->next_level ); } END_LOCKED_MASTER(threading) @@ -142,18 +137,19 @@ void read_tv_from_file_PRECISION( level_struct *l, struct Thread *threading ) { int n = l->num_eig_vect, i; char filename[STRINGLENGTH+1]; - vector_double tmp = NULL; + vector_double tmp; + vector_double_init(&tmp); - MALLOC( tmp, complex_double, l->inner_vector_size ); + vector_double_alloc( &tmp, _INNER, 1, l, no_threading ); for ( i=0; iis_PRECISION.test_vector[i], tmp, l->s_PRECISION.op.translation_table, l, no_threading ); + vector_io( (double*)tmp.vector_buffer, filename, _READ, l ); + trans_PRECISION( &(l->is_PRECISION.test_vector[i]), &tmp, l->s_PRECISION.op.translation_table, l, no_threading ); } - FREE( tmp, complex_double, l->inner_vector_size ); + vector_double_free( &tmp, l, no_threading ); END_LOCKED_MASTER(threading) @@ -197,20 +193,22 @@ void interpolation_PRECISION_define( vector_double *V, level_struct *l, struct T int pi = 1, pn = n*6; #endif vector_PRECISION *buffer = NULL; + int start = threading->start_index[l->depth]; int end = threading->end_index[l->depth]; if ( V == NULL ) { - PUBLIC_MALLOC( buffer, complex_PRECISION*, 3 ); + PUBLIC_MALLOC( buffer, vector_PRECISION, 3 ); START_MASTER(threading) - buffer[0] = NULL; + 
vector_PRECISION_init(&buffer[0]); END_MASTER(threading) - PUBLIC_MALLOC( buffer[0], complex_PRECISION, l->vector_size*3 ); START_MASTER(threading) - for( i=1; i<3; i++) - buffer[i] = buffer[0] + l->vector_size*i; + for( i=0; i<3; i++){ + vector_PRECISION_init( &buffer[i] ); + vector_PRECISION_alloc( &buffer[i], _ORDINARY, 1, l, threading ); + } if ( g.print > 0 ) printf0("initial definition --- depth: %d\n", l->depth ); #ifdef DEBUG if ( g.print > 0 ) { printf0("\033[0;42m\033[1;37m|"); fflush(0); } @@ -221,16 +219,16 @@ void interpolation_PRECISION_define( vector_double *V, level_struct *l, struct T for ( k=0; kdepth == 0 ) { START_LOCKED_MASTER(threading) - vector_PRECISION_define_random( l->is_PRECISION.test_vector[k], 0, l->inner_vector_size, l ); + vector_PRECISION_define_random( &(l->is_PRECISION.test_vector[k]), 0, l->inner_vector_size, l ); END_LOCKED_MASTER(threading) // } - smoother_PRECISION( buffer[0], NULL, l->is_PRECISION.test_vector[k], 1, _NO_RES, l, threading ); - vector_PRECISION_copy( l->is_PRECISION.test_vector[k], buffer[0], start, end, l ); - smoother_PRECISION( buffer[0], NULL, l->is_PRECISION.test_vector[k], g.method>=4?1:2, _NO_RES, l, threading ); - vector_PRECISION_copy( l->is_PRECISION.test_vector[k], buffer[0], start, end, l ); - smoother_PRECISION( buffer[0], NULL, l->is_PRECISION.test_vector[k], g.method>=4?1:3, _NO_RES, l, threading ); - vector_PRECISION_copy( l->is_PRECISION.test_vector[k], buffer[0], start, end, l ); + smoother_PRECISION( &buffer[0], NULL, &(l->is_PRECISION.test_vector[k]), 1, _NO_RES, l, threading ); + vector_PRECISION_copy( &(l->is_PRECISION.test_vector[k]), &buffer[0], start, end, l ); + smoother_PRECISION( &buffer[0], NULL, &(l->is_PRECISION.test_vector[k]), g.method>=4?1:2, _NO_RES, l, threading ); + vector_PRECISION_copy( &(l->is_PRECISION.test_vector[k]), &buffer[0], start, end, l ); + smoother_PRECISION( &buffer[0], NULL, &(l->is_PRECISION.test_vector[k]), g.method>=4?1:3, _NO_RES, l, threading ); + vector_PRECISION_copy( &(l->is_PRECISION.test_vector[k]), &buffer[0], start, end, l ); pc += 6; #ifdef DEBUG @@ -240,12 +238,14 @@ void interpolation_PRECISION_define( vector_double *V, level_struct *l, struct T #endif } - PUBLIC_FREE( buffer[0], complex_PRECISION, l->vector_size*3 ); - PUBLIC_FREE( buffer, complex_PRECISION*, 3 ); + for( i=0; i<3; i++){ + vector_PRECISION_free( &buffer[i], l, threading ); + } + PUBLIC_FREE( buffer, vector_PRECISION, 3 ); for ( k=0; kis_PRECISION.test_vector[k], l->is_PRECISION.test_vector[k], - 1.0/global_norm_PRECISION( l->is_PRECISION.test_vector[k], 0, l->inner_vector_size, l, threading ), + vector_PRECISION_real_scale( &(l->is_PRECISION.test_vector[k]), &(l->is_PRECISION.test_vector[k]), + 1.0/global_norm_PRECISION( &(l->is_PRECISION.test_vector[k]), 0, l->inner_vector_size, l, threading ), start, end, l ); } @@ -257,27 +257,20 @@ void interpolation_PRECISION_define( vector_double *V, level_struct *l, struct T } else { for ( i=0; iis_PRECISION.test_vector[i], V[i], l->s_PRECISION.op.translation_table, l, threading ); + trans_PRECISION( &(l->is_PRECISION.test_vector[i]), &V[i], l->s_PRECISION.op.translation_table, l, threading ); } } -#ifndef INTERPOLATION_SETUP_LAYOUT_OPTIMIZED_PRECISION for ( k=0; kis_PRECISION.interpolation[k], l->is_PRECISION.test_vector[k], start, end, l ); + vector_PRECISION_copy( &(l->is_PRECISION.interpolation[k]), &(l->is_PRECISION.test_vector[k]), start, end, l ); } -#endif testvector_analysis_PRECISION( l->is_PRECISION.test_vector, l, threading ); -#ifdef 
INTERPOLATION_SETUP_LAYOUT_OPTIMIZED_PRECISION - define_interpolation_PRECISION_operator( l->is_PRECISION.test_vector, l, threading ); - gram_schmidt_on_aggregates_PRECISION_vectorized( l->is_PRECISION.operator, n, l, threading ); -#else gram_schmidt_on_aggregates_PRECISION( l->is_PRECISION.interpolation, n, l, threading ); define_interpolation_PRECISION_operator( l->is_PRECISION.interpolation, l, threading ); -#endif } @@ -286,16 +279,8 @@ void re_setup_PRECISION( level_struct *l, struct Thread *threading ) { if ( l->level > 0 ) { if ( !l->idle ) { -#ifdef INTERPOLATION_SETUP_LAYOUT_OPTIMIZED_PRECISION - define_interpolation_PRECISION_operator( l->is_PRECISION.test_vector, l, threading ); - gram_schmidt_on_aggregates_PRECISION_vectorized( l->is_PRECISION.operator, l->num_eig_vect, l, threading ); - if ( l->depth > 0 ) - gram_schmidt_on_aggregates_PRECISION_vectorized( l->is_PRECISION.operator, l->num_eig_vect, l, threading ); - coarse_operator_PRECISION_setup_vectorized( l->is_PRECISION.operator, l, threading ); - START_LOCKED_MASTER(threading) -#else for ( int i=0; inum_eig_vect; i++ ) { - vector_PRECISION_copy( l->is_PRECISION.interpolation[i], l->is_PRECISION.test_vector[i], + vector_PRECISION_copy( &(l->is_PRECISION.interpolation[i]), &(l->is_PRECISION.test_vector[i]), threading->start_index[l->depth], threading->end_index[l->depth], l ); } gram_schmidt_on_aggregates_PRECISION( l->is_PRECISION.interpolation, l->num_eig_vect, l, threading ); @@ -304,7 +289,7 @@ void re_setup_PRECISION( level_struct *l, struct Thread *threading ) { define_interpolation_PRECISION_operator( l->is_PRECISION.interpolation, l, threading ); START_LOCKED_MASTER(threading) coarse_operator_PRECISION_setup( l->is_PRECISION.interpolation, l ); -#endif + conf_PRECISION_gather( &(l->next_level->s_PRECISION.op), &(l->next_level->op_PRECISION), l->next_level ); END_LOCKED_MASTER(threading) if ( !l->next_level->idle && l->next_level->level > 0 ) { @@ -331,15 +316,16 @@ void re_setup_PRECISION( level_struct *l, struct Thread *threading ) { void inv_iter_2lvl_extension_setup_PRECISION( int setup_iter, level_struct *l, struct Thread *threading ) { if ( !l->idle ) { - vector_PRECISION buf1 = NULL; + vector_PRECISION buf1; gmres_PRECISION_struct gmres; // TODO: bugfix - threading, etc START_LOCKED_MASTER(threading) - MALLOC( buf1, complex_PRECISION, l->vector_size ); + vector_PRECISION_init( &buf1 ); + vector_PRECISION_alloc( &buf1, _ORDINARY, 1, l, no_threading ); fgmres_PRECISION_struct_init( &gmres ); - fgmres_PRECISION_struct_alloc( g.coarse_iter, g.coarse_restart, l->next_level->vector_size, g.coarse_tol, + fgmres_PRECISION_struct_alloc( g.coarse_iter, g.coarse_restart, _ORDINARY, g.coarse_tol, _COARSE_GMRES, _NOTHING, NULL, apply_coarse_operator_PRECISION, &gmres, l->next_level ); if ( g.odd_even && l->next_level->level == 0 ) @@ -358,7 +344,7 @@ void inv_iter_2lvl_extension_setup_PRECISION( int setup_iter, level_struct *l, s #endif END_MASTER(threading) for ( int i=0; inum_eig_vect; i++ ) { - restrict_PRECISION( gmres.b, l->is_PRECISION.test_vector[i], l, threading ); + restrict_PRECISION( &(gmres.b), &(l->is_PRECISION.test_vector[i]), l, threading ); if ( !l->next_level->idle ) { if ( g.odd_even && l->next_level->level == 0 ) { coarse_solve_odd_even_PRECISION( &gmres, &(l->next_level->oe_op_PRECISION), l->next_level, threading ); @@ -366,10 +352,10 @@ void inv_iter_2lvl_extension_setup_PRECISION( int setup_iter, level_struct *l, s fgmres_PRECISION( &gmres, l->next_level, threading ); } } - interpolate3_PRECISION( 
buf1, gmres.x, l, threading ); - smoother_PRECISION( buf1, NULL, l->is_PRECISION.test_vector[i], l->post_smooth_iter, _RES, l, threading ); - vector_PRECISION_real_scale( l->is_PRECISION.test_vector[i], buf1, - 1.0/global_norm_PRECISION( buf1, 0, l->inner_vector_size, l, threading ), + interpolate3_PRECISION( &buf1, &(gmres.x), l, threading ); + smoother_PRECISION( &buf1, NULL, &(l->is_PRECISION.test_vector[i]), l->post_smooth_iter, _RES, l, threading ); + vector_PRECISION_real_scale( &(l->is_PRECISION.test_vector[i]), &buf1, + 1.0/global_norm_PRECISION( &buf1, 0, l->inner_vector_size, l, threading ), threading->start_index[l->depth], threading->end_index[l->depth], l ); pc += l->post_smooth_iter; #ifdef DEBUG @@ -384,16 +370,8 @@ void inv_iter_2lvl_extension_setup_PRECISION( int setup_iter, level_struct *l, s END_MASTER(threading) #endif -#ifdef INTERPOLATION_SETUP_LAYOUT_OPTIMIZED_PRECISION - define_interpolation_PRECISION_operator( l->is_PRECISION.test_vector, l, threading ); - gram_schmidt_on_aggregates_PRECISION_vectorized( l->is_PRECISION.operator, l->num_eig_vect, l, threading ); - if ( l->depth > 0 ) - gram_schmidt_on_aggregates_PRECISION_vectorized( l->is_PRECISION.operator, l->num_eig_vect, l, threading ); - coarse_operator_PRECISION_setup_vectorized( l->is_PRECISION.operator, l, threading ); - START_LOCKED_MASTER(threading) -#else for ( int i=0; inum_eig_vect; i++ ) - vector_PRECISION_copy( l->is_PRECISION.interpolation[i], l->is_PRECISION.test_vector[i], + vector_PRECISION_copy( &(l->is_PRECISION.interpolation[i]), &(l->is_PRECISION.test_vector[i]), threading->start_index[l->depth], threading->end_index[l->depth], l ); gram_schmidt_on_aggregates_PRECISION( l->is_PRECISION.interpolation, l->num_eig_vect, l, threading ); if ( l->depth > 0 ) @@ -401,7 +379,7 @@ void inv_iter_2lvl_extension_setup_PRECISION( int setup_iter, level_struct *l, s define_interpolation_PRECISION_operator( l->is_PRECISION.interpolation, l, threading ); START_LOCKED_MASTER(threading) coarse_operator_PRECISION_setup( l->is_PRECISION.interpolation, l ); -#endif + conf_PRECISION_gather( &(l->next_level->s_PRECISION.op), &(l->next_level->op_PRECISION), l->next_level ); END_LOCKED_MASTER(threading) if ( !l->next_level->idle && l->next_level->level > 0 ) { @@ -425,7 +403,7 @@ void inv_iter_2lvl_extension_setup_PRECISION( int setup_iter, level_struct *l, s inv_iter_2lvl_extension_setup_PRECISION( setup_iter, l->next_level, threading ); START_LOCKED_MASTER(threading) - FREE( buf1, complex_PRECISION, l->vector_size ); + vector_PRECISION_free( &buf1, l, no_threading ); fgmres_PRECISION_struct_free( &gmres, l ); END_LOCKED_MASTER(threading) } @@ -448,17 +426,19 @@ void test_vector_PRECISION_update( int i, level_struct *l, struct Thread *thread test_vector_PRECISION_update( i, l->next_level, threading ); if ( !l->idle ) - vector_PRECISION_real_scale( l->is_PRECISION.test_vector[i], l->p_PRECISION.x, - 1.0/global_norm_PRECISION( l->p_PRECISION.x, 0, l->inner_vector_size, l, threading ), + vector_PRECISION_real_scale( &(l->is_PRECISION.test_vector[i]), &(l->p_PRECISION.x), + 1.0/global_norm_PRECISION( &(l->p_PRECISION.x), 0, l->inner_vector_size, l, threading ), threading->start_index[l->depth], threading->end_index[l->depth], l ); } void inv_iter_inv_fcycle_PRECISION( int setup_iter, level_struct *l, struct Thread *threading ) { - vector_PRECISION v_buf = NULL; + vector_PRECISION v_buf; complex_PRECISION *buffer = NULL; + vector_PRECISION_init(&v_buf); + PUBLIC_MALLOC( buffer, complex_PRECISION, 2*l->num_eig_vect ); 
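+ // note: buffer holds only 2*num_eig_vect scalars of workspace for orthonormalizing the test vectors, not a lattice field, so it stays a raw PUBLIC_MALLOC array; the lattice-sized v_buf is handled through the new vector_PRECISION_alloc/_free below instead.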
START_LOCKED_MASTER(threading) @@ -466,8 +446,8 @@ void inv_iter_inv_fcycle_PRECISION( int setup_iter, level_struct *l, struct Thre set_kcycle_tol_PRECISION( g.coarse_tol, l ); END_LOCKED_MASTER(threading) SYNC_MASTER_TO_ALL(threading) - - PUBLIC_MALLOC( v_buf, complex_PRECISION, l->vector_size ); + + vector_PRECISION_alloc( &v_buf, _ORDINARY, 1, l, threading ); if ( !l->idle ) { for ( int j=0; jis_PRECISION.test_vector, buffer, 0, l->num_eig_vect, l, threading ); for ( int i=0; inum_eig_vect; i++ ) { - vcycle_PRECISION( l->p_PRECISION.x, NULL, l->is_PRECISION.test_vector[i], _NO_RES, l, threading ); + vcycle_PRECISION( &(l->p_PRECISION.x), NULL, &(l->is_PRECISION.test_vector[i]), _NO_RES, l, threading ); test_vector_PRECISION_update( i, l, threading ); @@ -515,8 +495,8 @@ void inv_iter_inv_fcycle_PRECISION( int setup_iter, level_struct *l, struct Thre ((double)l->setup_iter))), l->next_level, threading ); } } - - PUBLIC_FREE( v_buf, complex_PRECISION, l->vector_size ); + + vector_PRECISION_free( &v_buf, l, threading ); PUBLIC_FREE( buffer, complex_PRECISION, 2*l->num_eig_vect ); if ( l->depth == 0 ) { @@ -537,12 +517,12 @@ void testvector_analysis_PRECISION( vector_PRECISION *test_vectors, level_struct printf0("--------------------------------------- depth: %d ----------------------------------------\n", l->depth ); for ( int i=0; inum_eig_vect; i++ ) { printf0("vector #%02d: ", i+1 ); - apply_operator_PRECISION( l->vbuf_PRECISION[3], test_vectors[i], &(l->p_PRECISION), l, no_threading ); - coarse_gamma5_PRECISION( l->vbuf_PRECISION[0], l->vbuf_PRECISION[3], 0, l->inner_vector_size, l ); - lambda = global_inner_product_PRECISION( test_vectors[i], l->vbuf_PRECISION[0], 0, l->inner_vector_size, l, no_threading ); - lambda /= global_inner_product_PRECISION( test_vectors[i], test_vectors[i], 0, l->inner_vector_size, l, no_threading ); - vector_PRECISION_saxpy( l->vbuf_PRECISION[1], l->vbuf_PRECISION[0], test_vectors[i], -lambda, 0, l->inner_vector_size, l ); - mu = global_norm_PRECISION( l->vbuf_PRECISION[1], 0, l->inner_vector_size, l, no_threading )/global_norm_PRECISION( test_vectors[i], 0, l->inner_vector_size, l, no_threading ); + apply_operator_PRECISION( &(l->vbuf_PRECISION[3]), test_vectors+i, &(l->p_PRECISION), l, no_threading ); + coarse_gamma5_PRECISION( &(l->vbuf_PRECISION[0]), &(l->vbuf_PRECISION[3]), 0, l->inner_vector_size, l ); + lambda = global_inner_product_PRECISION( test_vectors+i, &(l->vbuf_PRECISION[0]), 0, l->inner_vector_size, l, no_threading ); + lambda /= global_inner_product_PRECISION( test_vectors+i, test_vectors+i, 0, l->inner_vector_size, l, no_threading ); + vector_PRECISION_saxpy( &(l->vbuf_PRECISION[1]), &(l->vbuf_PRECISION[0]), test_vectors+i, -lambda, 0, l->inner_vector_size, l ); + mu = global_norm_PRECISION( &(l->vbuf_PRECISION[1]), 0, l->inner_vector_size, l, no_threading )/global_norm_PRECISION( test_vectors+i, 0, l->inner_vector_size, l, no_threading ); printf0("singular value: %+lf%+lfi, singular vector precision: %le\n", (double)creal(lambda), (double)cimag(lambda), (double)mu ); } printf0("--------------------------------------- depth: %d ----------------------------------------\n", l->depth ); diff --git a/src/setup_generic.h b/src/setup_generic.h index 6d0ae49..c2926a2 100644 --- a/src/setup_generic.h +++ b/src/setup_generic.h @@ -26,7 +26,7 @@ void coarse_grid_correction_PRECISION_setup( level_struct *l, struct Thread *threading ); void coarse_grid_correction_PRECISION_free( level_struct *l ); - void interpolation_PRECISION_define( vector_double* 
V, level_struct *l, struct Thread *threading ); + void interpolation_PRECISION_define( vector_double *V, level_struct *l, struct Thread *threading ); void iterative_PRECISION_setup( int setup_iter, level_struct *l, struct Thread *threading ); void re_setup_PRECISION( level_struct *l, struct Thread *threading ); void inv_iter_inv_fcycle_PRECISION( int setup_iter, level_struct *l, struct Thread *threading ); diff --git a/src/solver_analysis.c b/src/solver_analysis.c index 325165e..1c1a20b 100644 --- a/src/solver_analysis.c +++ b/src/solver_analysis.c @@ -50,6 +50,11 @@ void test_routine( level_struct *l, struct Thread *threading ) { if ( g.method > 0 && g.method < 4 && g.odd_even ) block_oddeven_double_test( l, threading ); } + /* if ( g.mixed_precision ) + vector_float_test_routine( l, threading ); + else + vector_double_test_routine( l, threading ); +*/ if ( g.interpolation && g.method > 0 ) { if ( g.mixed_precision ) coarse_operator_float_test_routine( l, threading ); diff --git a/src/sse_blas_vectorized.h b/src/sse_blas_vectorized.h deleted file mode 100644 index df99468..0000000 --- a/src/sse_blas_vectorized.h +++ /dev/null @@ -1,381 +0,0 @@ -/* - * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder. - * - * This file is part of the DDalphaAMG solver library. - * - * The DDalphaAMG solver library is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * The DDalphaAMG solver library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * - * You should have received a copy of the GNU General Public License - * along with the DDalphaAMG solver library. If not, see http://www.gnu.org/licenses/. 
- * - */ - -#ifndef SSE_BLAS_VECTORIZED_H -#define SSE_BLAS_VECTORIZED_H -#ifdef SSE - -static inline void sse_cgem_inverse( const int N, float *A_inverse, float *A, int lda ) { - // generate LU decomp in A - - int i, j, k; - complex_float alpha; - - complex_float tmpA[N*N]; - complex_float tmpA_inverse[N*N]; - - for ( j=0; j0 ) - b[k-1] = 0; - - for ( i=0; i=0; i-- ) { - for ( j=i+1; j= j*offset; i -= SIMD_LENGTH_float ) { - ip = i%offset + 2*(i/offset)*padded; - A_re = _mm_unpacklo_ps( C1_re[i/SIMD_LENGTH_float], C1_im[i/SIMD_LENGTH_float] ); - A_im = _mm_unpackhi_ps( C1_re[i/SIMD_LENGTH_float], C1_im[i/SIMD_LENGTH_float] ); - _mm_store_ps( C+2*ip, A_re ); - _mm_store_ps( C+2*ip+SIMD_LENGTH_float, A_im ); - A_re = _mm_unpacklo_ps( C2_re[i/SIMD_LENGTH_float], C2_im[i/SIMD_LENGTH_float] ); - A_im = _mm_unpackhi_ps( C2_re[i/SIMD_LENGTH_float], C2_im[i/SIMD_LENGTH_float] ); - _mm_store_ps( C+2*(ip+padded), A_re ); - _mm_store_ps( C+2*(ip+padded)+SIMD_LENGTH_float, A_im ); - } - } - } else { -#endif - __m128 A_re; - __m128 A_im; - __m128 B_re; - __m128 B_im; - __m128 C_re[lda/SIMD_LENGTH_float]; - __m128 C_im[lda/SIMD_LENGTH_float]; - - // deinterleaved load - for ( i=0; i= j*offset; i -= SIMD_LENGTH_float ) { - ip = i%offset + 2*(i/offset)*padded; - A_re = _mm_unpacklo_ps( C1_re[i/SIMD_LENGTH_float], C1_im[i/SIMD_LENGTH_float] ); - A_im = _mm_unpackhi_ps( C1_re[i/SIMD_LENGTH_float], C1_im[i/SIMD_LENGTH_float] ); - _mm_store_ps( C+2*ip, A_re ); - _mm_store_ps( C+2*ip+SIMD_LENGTH_float, A_im ); - A_re = _mm_unpacklo_ps( C2_re[i/SIMD_LENGTH_float], C2_im[i/SIMD_LENGTH_float] ); - A_im = _mm_unpackhi_ps( C2_re[i/SIMD_LENGTH_float], C2_im[i/SIMD_LENGTH_float] ); - _mm_store_ps( C+2*(ip+padded), A_re ); - _mm_store_ps( C+2*(ip+padded)+SIMD_LENGTH_float, A_im ); - } - } - } else { -#endif - __m128 A_re; - __m128 A_im; - __m128 B_re; - __m128 B_im; - __m128 C_re[lda/SIMD_LENGTH_float]; - __m128 C_im[lda/SIMD_LENGTH_float]; - - // deinterleaved load - for ( i=0; inext_level->num_lattice_site_var/2, - offset = l->num_lattice_site_var/2; - float *spin_0_1_pt; - float *spin_2_3_pt; - float *interpolation_data; - - int component_offset = SIMD_LENGTH_float; - int fine_components = l->num_lattice_site_var; - - // U(x) = [ A B , A=A*, D=D*, C = -B* - // C D ] - // storage order: upper triangle of A, upper triangle of D, B, columnwise - // diagonal coupling - for ( int n=0; nvector_size + fine_components*SIMD_LENGTH_float*site); - - // A - buffer_re = _mm_load_ps((float *)(tmp+k1)+k+0*OPERATOR_COMPONENT_OFFSET_float); - buffer_im = _mm_load_ps((float *)(tmp+k1)+k+1*OPERATOR_COMPONENT_OFFSET_float); - for ( m=0; m broadcast - __m128 spin_0_1_re = _mm_set1_ps(spin_0_1_pt[(2*m+0)*component_offset]); - __m128 spin_0_1_im = _mm_set1_ps(spin_0_1_pt[(2*m+1)*component_offset]); - __m128 interpolation_data_re = _mm_load_ps(interpolation_data + (2*m+0)*component_offset); - __m128 interpolation_data_im = _mm_load_ps(interpolation_data + (2*m+1)*component_offset); - - cfmadd_conj(interpolation_data_re, interpolation_data_im, spin_0_1_re, spin_0_1_im, &buffer_re, &buffer_im); - } - _mm_store_ps((float *)(tmp+k1)+k+0*OPERATOR_COMPONENT_OFFSET_float, buffer_re); - _mm_store_ps((float *)(tmp+k1)+k+1*OPERATOR_COMPONENT_OFFSET_float, buffer_im); - - // D - buffer_re = _mm_load_ps((float *)(tmp+k2)+k+0*OPERATOR_COMPONENT_OFFSET_float); - buffer_im = _mm_load_ps((float *)(tmp+k2)+k+1*OPERATOR_COMPONENT_OFFSET_float); - for ( m=offset; m<2*offset; m++ ) { - // spin_2_3 is the same for all k => broadcast - __m128 spin_2_3_re = 
_mm_set1_ps(spin_2_3_pt[(2*m+0)*component_offset]); - __m128 spin_2_3_im = _mm_set1_ps(spin_2_3_pt[(2*m+1)*component_offset]); - __m128 interpolation_data_re = _mm_load_ps(interpolation_data + (2*m+0)*component_offset); - __m128 interpolation_data_im = _mm_load_ps(interpolation_data + (2*m+1)*component_offset); - - cfmadd_conj(interpolation_data_re, interpolation_data_im, spin_2_3_re, spin_2_3_im, &buffer_re, &buffer_im); - } - _mm_store_ps((float *)(tmp+k2)+k+0*OPERATOR_COMPONENT_OFFSET_float, buffer_re); - _mm_store_ps((float *)(tmp+k2)+k+1*OPERATOR_COMPONENT_OFFSET_float, buffer_im); - } - - // index k used for vectorization - for ( k=0; kvector_size + fine_components*component_offset*site); - - // B - buffer_re = _mm_load_ps((float *)(tmp+k1)+k+0*OPERATOR_COMPONENT_OFFSET_float); - buffer_im = _mm_load_ps((float *)(tmp+k1)+k+1*OPERATOR_COMPONENT_OFFSET_float); - for ( m=0; m broadcast - __m128 spin_2_3_re = _mm_set1_ps(spin_2_3_pt[(2*m+0)*component_offset]); - __m128 spin_2_3_im = _mm_set1_ps(spin_2_3_pt[(2*m+1)*component_offset]); - __m128 interpolation_data_re = _mm_load_ps(interpolation_data + (2*m+0)*component_offset); - __m128 interpolation_data_im = _mm_load_ps(interpolation_data + (2*m+1)*component_offset); - - cfmadd_conj(interpolation_data_re, interpolation_data_im, spin_2_3_re, spin_2_3_im, &buffer_re, &buffer_im); - } - _mm_store_ps((float *)(tmp+k1)+k+0*OPERATOR_COMPONENT_OFFSET_float, buffer_re); - _mm_store_ps((float *)(tmp+k1)+k+1*OPERATOR_COMPONENT_OFFSET_float, buffer_im); - } - } -#endif -} - - -static inline void sse_set_coarse_neighbor_coupling_float( complex_float *spin_0_1, complex_float *spin_2_3, - complex_float *V, const int mu, level_struct *l, int site, const int n_rhs, complex_float *tmp ) { - -#ifdef SSE - int k, k1, k2, m, num_eig_vect = l->next_level->num_lattice_site_var/2, - offset = l->num_lattice_site_var/2; - - float *spin_0_1_pt; - float *spin_2_3_pt; - float *interpolation_data; - - int component_offset = SIMD_LENGTH_float; - int fine_components = l->num_lattice_site_var; - - // U_mu(x) = [ A B , U_-mu(x+muhat) = [ A* -C* - // C D ] -B* D* ] - // storage order: A, C, B, D, each column wise - for ( int n=0; nvector_size + fine_components*component_offset*site); - - k1 = (n+0*num_eig_vect)*OPERATOR_COMPONENT_OFFSET_float; - k2 = (n+1*num_eig_vect)*OPERATOR_COMPONENT_OFFSET_float; - - // A - buffer_re = _mm_load_ps((float *)(tmp+k1)+k+0*OPERATOR_COMPONENT_OFFSET_float); - buffer_im = _mm_load_ps((float *)(tmp+k1)+k+1*OPERATOR_COMPONENT_OFFSET_float); - for ( m=0; m broadcast - __m128 spin_0_1_re = _mm_set1_ps(spin_0_1_pt[(2*m+0)*component_offset]); - __m128 spin_0_1_im = _mm_set1_ps(spin_0_1_pt[(2*m+1)*component_offset]); - __m128 interpolation_data_re = _mm_load_ps(interpolation_data + (2*m+0)*component_offset); - __m128 interpolation_data_im = _mm_load_ps(interpolation_data + (2*m+1)*component_offset); - - cfmadd_conj(interpolation_data_re, interpolation_data_im, spin_0_1_re, spin_0_1_im, &buffer_re, &buffer_im); - } - _mm_store_ps((float *)(tmp+k1)+k+0*OPERATOR_COMPONENT_OFFSET_float, buffer_re); - _mm_store_ps((float *)(tmp+k1)+k+1*OPERATOR_COMPONENT_OFFSET_float, buffer_im); - - // C - buffer_re = _mm_load_ps((float *)(tmp+k2)+k+0*OPERATOR_COMPONENT_OFFSET_float); - buffer_im = _mm_load_ps((float *)(tmp+k2)+k+1*OPERATOR_COMPONENT_OFFSET_float); - for ( m=offset; m<2*offset; m++ ) { - // spin_0_1 is the same for all k => broadcast - __m128 spin_0_1_re = _mm_set1_ps(spin_0_1_pt[(2*m+0)*component_offset]); - __m128 spin_0_1_im = 
_mm_set1_ps(spin_0_1_pt[(2*m+1)*component_offset]); - __m128 interpolation_data_re = _mm_load_ps(interpolation_data + (2*m+0)*component_offset); - __m128 interpolation_data_im = _mm_load_ps(interpolation_data + (2*m+1)*component_offset); - - cfmadd_conj(interpolation_data_re, interpolation_data_im, spin_0_1_re, spin_0_1_im, &buffer_re, &buffer_im); - } - _mm_store_ps((float *)(tmp+k2)+k+0*OPERATOR_COMPONENT_OFFSET_float, buffer_re); - _mm_store_ps((float *)(tmp+k2)+k+1*OPERATOR_COMPONENT_OFFSET_float, buffer_im); - - - k1 = (n+2*num_eig_vect)*OPERATOR_COMPONENT_OFFSET_float; - k2 = (n+3*num_eig_vect)*OPERATOR_COMPONENT_OFFSET_float; - - // B - buffer_re = _mm_load_ps((float *)(tmp+k1)+k+0*OPERATOR_COMPONENT_OFFSET_float); - buffer_im = _mm_load_ps((float *)(tmp+k1)+k+1*OPERATOR_COMPONENT_OFFSET_float); - for ( m=0; m broadcast - __m128 spin_2_3_re = _mm_set1_ps(spin_2_3_pt[(2*m+0)*component_offset]); - __m128 spin_2_3_im = _mm_set1_ps(spin_2_3_pt[(2*m+1)*component_offset]); - __m128 interpolation_data_re = _mm_load_ps(interpolation_data + (2*m+0)*component_offset); - __m128 interpolation_data_im = _mm_load_ps(interpolation_data + (2*m+1)*component_offset); - - cfmadd_conj(interpolation_data_re, interpolation_data_im, spin_2_3_re, spin_2_3_im, &buffer_re, &buffer_im); - } - _mm_store_ps((float *)(tmp+k1)+k+0*OPERATOR_COMPONENT_OFFSET_float, buffer_re); - _mm_store_ps((float *)(tmp+k1)+k+1*OPERATOR_COMPONENT_OFFSET_float, buffer_im); - - // D - buffer_re = _mm_load_ps((float *)(tmp+k2)+k+0*OPERATOR_COMPONENT_OFFSET_float); - buffer_im = _mm_load_ps((float *)(tmp+k2)+k+1*OPERATOR_COMPONENT_OFFSET_float); - for ( m=offset; m<2*offset; m++ ) { - // spin_2_3 is the same for all k => broadcast - __m128 spin_2_3_re = _mm_set1_ps(spin_2_3_pt[(2*m+0)*component_offset]); - __m128 spin_2_3_im = _mm_set1_ps(spin_2_3_pt[(2*m+1)*component_offset]); - __m128 interpolation_data_re = _mm_load_ps(interpolation_data + (2*m+0)*component_offset); - __m128 interpolation_data_im = _mm_load_ps(interpolation_data + (2*m+1)*component_offset); - - cfmadd_conj(interpolation_data_re, interpolation_data_im, spin_2_3_re, spin_2_3_im, &buffer_re, &buffer_im); - } - _mm_store_ps((float *)(tmp+k2)+k+0*OPERATOR_COMPONENT_OFFSET_float, buffer_re); - _mm_store_ps((float *)(tmp+k2)+k+1*OPERATOR_COMPONENT_OFFSET_float, buffer_im); - } - } -#endif -} - - -static inline void sse_coarse_spinwise_site_self_couplings_float( complex_float *eta1, complex_float *eta2, - complex_float *phi, config_float clover, int elements, level_struct *l ) { - -#ifdef SSE - int num_eig_vect = l->num_lattice_site_var/2; - int clover_step_size1 = (num_eig_vect * (num_eig_vect+1))/2; - complex_float *eta[2] = {eta1, eta2}; - // U(x) = [ A B , A=A*, D=D*, C = -B* - // C D ] - // storage order: upper triangle of A, upper triangle of D, B, columnwise - - __m128 clover_re; - __m128 clover_im; - __m128 in_re; - __m128 in_im; - __m128 out_re; - __m128 out_im; - - // zero output matrices - __m128 zero = _mm_setzero_ps(); - for(int s=0; s<2; s++) { - for(int i=0; ieta1) or 2and3 (->eta2) - eta[1] += num_eig_vect*elements; - for(int s=0; s<2; s++) { - // A and D: column major hermitian, stored as upper triangular - for(int i=0; inum_parent_eig_vect; - int block_step_size = (num_eig_vect * (num_eig_vect+1))/2; - complex_float *eta[2] = {eta1, eta2}; - // U(x) = [ A 0 , A=A*, D=D* - // 0 D ] - // storage order: upper triangle of A, upper triangle of D, B, columnwise - - __m128 block_re; - __m128 block_im; - __m128 in_re; - __m128 in_im; - __m128 out_re; - __m128 
out_im; - - // zero output matrices - __m128 zero = _mm_setzero_ps(); - for(int s=0; s<2; s++) { - for(int i=0; ieta1) or 2and3 (->eta2) - eta[1] += num_eig_vect*elements; - for(int s=0; s<2; s++) { - // A and D: column major hermitian, stored as upper triangular - for(int i=0; inext_level->num_parent_eig_vect, - offset = l->num_parent_eig_vect; - float *spin_0_1_pt; - float *spin_2_3_pt; - float *interpolation_data; - - int component_offset = SIMD_LENGTH_float; - int fine_components = l->num_lattice_site_var; - - // U(x) = [ A 0 , A=A*, D=D* - // 0 D ] - // storage order: upper triangle of A, upper triangle of D, B, columnwise - // diagonal coupling - for ( int n=0; nvector_size + fine_components*SIMD_LENGTH_float*site); - - // A - buffer_re = _mm_load_ps((float *)(tmp+k1)+k+0*OPERATOR_COMPONENT_OFFSET_float); - buffer_im = _mm_load_ps((float *)(tmp+k1)+k+1*OPERATOR_COMPONENT_OFFSET_float); - for ( m=0; m broadcast - __m128 spin_0_1_re = _mm_set1_ps(spin_0_1_pt[(2*m+0)*component_offset]); - __m128 spin_0_1_im = _mm_set1_ps(spin_0_1_pt[(2*m+1)*component_offset]); - __m128 interpolation_data_re = _mm_load_ps(interpolation_data + (2*m+0)*component_offset); - __m128 interpolation_data_im = _mm_load_ps(interpolation_data + (2*m+1)*component_offset); - - cfmadd_conj(interpolation_data_re, interpolation_data_im, spin_0_1_re, spin_0_1_im, &buffer_re, &buffer_im); - } - _mm_store_ps((float *)(tmp+k1)+k+0*OPERATOR_COMPONENT_OFFSET_float, buffer_re); - _mm_store_ps((float *)(tmp+k1)+k+1*OPERATOR_COMPONENT_OFFSET_float, buffer_im); - - // D - buffer_re = _mm_load_ps((float *)(tmp+k2)+k+0*OPERATOR_COMPONENT_OFFSET_float); - buffer_im = _mm_load_ps((float *)(tmp+k2)+k+1*OPERATOR_COMPONENT_OFFSET_float); - for ( m=offset; m<2*offset; m++ ) { - // spin_2_3 is the same for all k => broadcast - __m128 spin_2_3_re = _mm_set1_ps(spin_2_3_pt[(2*m+0)*component_offset]); - __m128 spin_2_3_im = _mm_set1_ps(spin_2_3_pt[(2*m+1)*component_offset]); - __m128 interpolation_data_re = _mm_load_ps(interpolation_data + (2*m+0)*component_offset); - __m128 interpolation_data_im = _mm_load_ps(interpolation_data + (2*m+1)*component_offset); - - cfmadd_conj(interpolation_data_re, interpolation_data_im, spin_2_3_re, spin_2_3_im, &buffer_re, &buffer_im); - } - _mm_store_ps((float *)(tmp+k2)+k+0*OPERATOR_COMPONENT_OFFSET_float, buffer_re); - _mm_store_ps((float *)(tmp+k2)+k+1*OPERATOR_COMPONENT_OFFSET_float, buffer_im); - } - } -#endif -} - -#endif //SSE -#endif diff --git a/src/sse_coarse_operator_generic.c b/src/sse_coarse_operator_generic.c deleted file mode 100644 index cf3b73d..0000000 --- a/src/sse_coarse_operator_generic.c +++ /dev/null @@ -1,962 +0,0 @@ -/* - * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder. - * - * This file is part of the DDalphaAMG solver library. - * - * The DDalphaAMG solver library is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * The DDalphaAMG solver library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * - * You should have received a copy of the GNU General Public License - * along with the DDalphaAMG solver library. If not, see http://www.gnu.org/licenses/. 
- * - */ - -#include "main.h" - -#ifdef SSE - -#include "sse_coarse_operator.h" - -#ifdef INTERPOLATION_SETUP_LAYOUT_OPTIMIZED_PRECISION -void coarse_operator_PRECISION_setup_vectorized( complex_PRECISION *operator, level_struct *l, struct Thread *threading ) { - - SYNC_HYPERTHREADS(threading) - SYNC_CORES(threading) - - double t0, t1; - t0 = MPI_Wtime(); - - int mu, j, n = l->num_eig_vect, num_aggregates = l->is_PRECISION.num_agg, - aggregate_sites = l->num_inner_lattice_sites / num_aggregates, - clover_site_size = (l->num_eig_vect*(l->num_eig_vect*2+1)), - block_site_size = (l->num_eig_vect*(l->num_eig_vect+1)), - D_link_size = 4*l->num_eig_vect*l->num_eig_vect*4, // size of links in all 4 directions - fine_components = l->num_lattice_site_var; - - - - START_LOCKED_MASTER(threading) - operator_PRECISION_define( &(l->next_level->op_PRECISION), l->next_level ); - END_LOCKED_MASTER(threading) - SYNC_HYPERTHREADS(threading) - - // each thread loops overs its aggregates and then over internal d.o.f. - for ( int a=threading->n_core*threading->thread+threading->core; an_core*threading->n_thread ) { - for ( j=0; jnext_level->op_PRECISION.D[j+a*D_link_size] = _COMPLEX_PRECISION_ZERO; - for ( j=0; jnext_level->op_PRECISION.clover[j+a*clover_site_size] = _COMPLEX_PRECISION_ZERO; - for ( j=0; jnext_level->op_PRECISION.odd_proj[j+a*block_site_size] = _COMPLEX_PRECISION_ZERO; - } - - complex_PRECISION *mpi_buffer = NULL; - START_MASTER(threading) - MALLOC_HUGEPAGES( mpi_buffer, complex_PRECISION, OPERATOR_COMPONENT_OFFSET_PRECISION*(l->vector_size-l->inner_vector_size), 64 ); - END_MASTER(threading) - - int direction_flags[8*l->block_lattice[T]*l->block_lattice[Z]*l->block_lattice[Y]*l->block_lattice[X]]; - - // set up table for direction flags - int *flags = direction_flags; - if(l->depth == 0) { - // even sites - for(int t=0; t < l->block_lattice[T]; t++) { - for(int z=0; z < l->block_lattice[Z]; z++) { - for(int y=0; y < l->block_lattice[Y]; y++) { - for(int x=0; x < l->block_lattice[X]; x++) { - if((x+y+z+t)%2 == 0) { - flags[2*X+0] = (x == 0)?0:1; - flags[2*X+1] = (x == l->block_lattice[X]-1)?0:1; - flags[2*Y+0] = (y == 0)?0:1; - flags[2*Y+1] = (y == l->block_lattice[Y]-1)?0:1; - flags[2*Z+0] = (z == 0)?0:1; - flags[2*Z+1] = (z == l->block_lattice[Z]-1)?0:1; - flags[2*T+0] = (t == 0)?0:1; - flags[2*T+1] = (t == l->block_lattice[T]-1)?0:1; - flags += 8; - } - } - } - } - } - // odd sites - for(int t=0; t < l->block_lattice[T]; t++) { - for(int z=0; z < l->block_lattice[Z]; z++) { - for(int y=0; y < l->block_lattice[Y]; y++) { - for(int x=0; x < l->block_lattice[X]; x++) { - if((x+y+z+t)%2 == 1) { - flags[2*X+0] = (x == 0)?0:1; - flags[2*X+1] = (x == l->block_lattice[X]-1)?0:1; - flags[2*Y+0] = (y == 0)?0:1; - flags[2*Y+1] = (y == l->block_lattice[Y]-1)?0:1; - flags[2*Z+0] = (z == 0)?0:1; - flags[2*Z+1] = (z == l->block_lattice[Z]-1)?0:1; - flags[2*T+0] = (t == 0)?0:1; - flags[2*T+1] = (t == l->block_lattice[T]-1)?0:1; - flags += 8; - } - } - } - } - } - } else { - for(int t=0; t < l->block_lattice[T]; t++) { - for(int z=0; z < l->block_lattice[Z]; z++) { - for(int y=0; y < l->block_lattice[Y]; y++) { - for(int x=0; x < l->block_lattice[X]; x++) { - flags[2*X+0] = (x == 0)?0:1; - flags[2*X+1] = (x == l->block_lattice[X]-1)?0:1; - flags[2*Y+0] = (y == 0)?0:1; - flags[2*Y+1] = (y == l->block_lattice[Y]-1)?0:1; - flags[2*Z+0] = (z == 0)?0:1; - flags[2*Z+1] = (z == l->block_lattice[Z]-1)?0:1; - flags[2*T+0] = (t == 0)?0:1; - flags[2*T+1] = (t == l->block_lattice[T]-1)?0:1; - flags += 8; - } - } - } - } 
- } - - complex_PRECISION eta1[fine_components*OPERATOR_COMPONENT_OFFSET_PRECISION] __attribute__((aligned(64))); - complex_PRECISION eta2[fine_components*OPERATOR_COMPONENT_OFFSET_PRECISION] __attribute__((aligned(64))); - complex_PRECISION tmp[4*4*n*OPERATOR_COMPONENT_OFFSET_PRECISION] __attribute__((aligned(64))); - - for ( int a=threading->n_core*threading->thread+threading->core; an_core*threading->n_thread ) { - - // new aggregate is starting, zero out tmp - for(int i=0; i<4*4*n*OPERATOR_COMPONENT_OFFSET_PRECISION; i++) - tmp[i] = 0.0; - - for ( int site=a*aggregate_sites; site<(a+1)*aggregate_sites; site++ ) { - if(l->depth == 0) { - for ( int c=0; cnum_eig_vect; c+=SIMD_LENGTH_PRECISION ) - d_plus_clover_aggregate_PRECISION_vectorized( eta1+c*fine_components, eta2+c*fine_components, - operator+c*l->vector_size, &(l->s_PRECISION), l, site, - direction_flags+8*(site%(l->block_lattice[T]*l->block_lattice[Z]*l->block_lattice[Y]*l->block_lattice[X])) ); - } else { - for ( int c=0; cnum_eig_vect; c+=SIMD_LENGTH_PRECISION ) - coarse_aggregate_self_couplings_PRECISION_vectorized( eta1+c*fine_components, eta2+c*fine_components, - operator+c*l->vector_size, &(l->s_PRECISION), l, site, - direction_flags+8*(site%(l->block_lattice[T]*l->block_lattice[Z]*l->block_lattice[Y]*l->block_lattice[X])) ); - } - set_coarse_self_coupling_PRECISION_vectorized( eta1, eta2, operator, l, site, n, tmp ); - } - - // aggregate is done, finalize - set_coarse_self_coupling_PRECISION_vectorized_finalize( l, a*aggregate_sites, n, tmp ); - - } - - - SYNC_HYPERTHREADS(threading) - START_LOCKED_MASTER(threading) - // neighbors - for ( int c=0; cnum_eig_vect; c+=SIMD_LENGTH_PRECISION ) { - for ( mu=0; mu<4; mu++ ) { - // determine start of buffer for this mu - int start = 0; - for ( int j=0; js_PRECISION.op.c.num_boundary_sites[2*j]; - - // update ghost cells of V[i] - negative_sendrecv_PRECISION_vectorized( operator+c*l->vector_size, mu, &(l->s_PRECISION.op.c), l, - SIMD_LENGTH_PRECISION, mpi_buffer+c*(l->vector_size-l->inner_vector_size)+fine_components*start*SIMD_LENGTH_PRECISION ); - } - for ( mu=0; mu<4; mu++ ) { - // finish updating ghostcells of V[i] - negative_wait_PRECISION( mu, &(l->s_PRECISION.op.c), l ); - } - } - END_LOCKED_MASTER(threading) - SYNC_HYPERTHREADS(threading) - - - for ( int a=threading->n_core*threading->thread+threading->core; an_core*threading->n_thread ) { - - // new aggregate is starting, zero out tmp - for(int i=0; i<4*4*n*OPERATOR_COMPONENT_OFFSET_PRECISION; i++) - tmp[i] = 0.0; - - for ( int site=a*aggregate_sites; site<(a+1)*aggregate_sites; site++ ) { - for ( mu=0; mu<4; mu++ ) { - if( (direction_flags+8*(site%(l->block_lattice[T]*l->block_lattice[Z]*l->block_lattice[Y]*l->block_lattice[X])))[2*mu+1] != 0) - continue; - - if(l->depth == 0) - for ( int c=0; cnum_eig_vect; c+=SIMD_LENGTH_PRECISION ) - d_neighbor_aggregate_PRECISION_vectorized( eta1+c*fine_components, eta2+c*fine_components, - operator+c*l->vector_size, mu, &(l->s_PRECISION), l, site ); - else - for ( int c=0; cnum_eig_vect; c+=SIMD_LENGTH_PRECISION ) - coarse_aggregate_neighbor_couplings_PRECISION_vectorized( eta1+c*fine_components, eta2+c*fine_components, - operator+c*l->vector_size, mu, &(l->s_PRECISION), l, site ); - set_coarse_neighbor_coupling_PRECISION_vectorized( eta1, eta2, operator, mu, l, site, n, tmp+mu*4*n*OPERATOR_COMPONENT_OFFSET_PRECISION ); - } - } - - // aggregate is done, finalize - for ( mu=0; mu<4; mu++ ) - set_coarse_neighbor_coupling_PRECISION_vectorized_finalize( mu, l, a*aggregate_sites, n, 
tmp+mu*4*n*OPERATOR_COMPONENT_OFFSET_PRECISION ); - } - - SYNC_HYPERTHREADS(threading) - SYNC_CORES(threading) - - for ( int a=threading->n_core*threading->thread+threading->core; an_core*threading->n_thread ) { - - // new aggregate is starting, zero out tmp - for(int i=0; i<4*4*n*OPERATOR_COMPONENT_OFFSET_PRECISION; i++) - tmp[i] = 0.0; - - for ( int site=a*aggregate_sites; site<(a+1)*aggregate_sites; site++ ) { - if(l->depth == 0) { - for ( int c=0; cnum_eig_vect; c+=SIMD_LENGTH_PRECISION ) - diagonal_aggregate_PRECISION_vectorized( eta1+c*fine_components, eta2+c*fine_components, - operator+c*l->vector_size, &(l->s_PRECISION), l, site ); - } else { - for ( int c=0; cnum_eig_vect; c+=SIMD_LENGTH_PRECISION ) - coarse_aggregate_block_diagonal_PRECISION_vectorized( eta1+c*fine_components, eta2+c*fine_components, - operator+c*l->vector_size, &(l->s_PRECISION), l, site ); - } - set_coarse_block_diagonal_PRECISION_vectorized( eta1, eta2, operator, l, site, n, tmp ); - } - - // aggregate is done, finalize - set_coarse_block_diagonal_PRECISION_vectorized_finalize( l, a*aggregate_sites, n, tmp ); - } - - SYNC_HYPERTHREADS(threading) - SYNC_CORES(threading) - - coarse_operator_PRECISION_setup_finalize( l, threading ); - - START_MASTER(threading) - FREE_HUGEPAGES( mpi_buffer, complex_PRECISION, OPERATOR_COMPONENT_OFFSET_PRECISION*(l->vector_size-l->inner_vector_size) ); - - t1 = MPI_Wtime(); - if ( g.print > 0 ) printf0("depth: %d, time spent for setting up next coarser operator: %lf seconds\n", l->depth, t1-t0 ); - END_MASTER(threading) - - SYNC_HYPERTHREADS(threading) - SYNC_CORES(threading) -} -#endif - -void set_coarse_self_coupling_PRECISION_vectorized( complex_PRECISION *spin_0_1, complex_PRECISION *spin_2_3, - complex_PRECISION *V, level_struct *l, int site, const int n_rhs, complex_PRECISION *tmp ) { - - sse_set_coarse_self_coupling_PRECISION( spin_0_1, spin_2_3, V, l, site, n_rhs, tmp ); -} - -void set_coarse_block_diagonal_PRECISION_vectorized( complex_PRECISION *spin_0_1, complex_PRECISION *spin_2_3, - complex_PRECISION *V, level_struct *l, int site, const int n_rhs, complex_PRECISION *tmp ) { - - sse_set_coarse_block_diagonal_PRECISION( spin_0_1, spin_2_3, V, l, site, n_rhs, tmp ); -} - -void set_coarse_self_coupling_PRECISION_vectorized_finalize( level_struct *l, int site, const int n_rhs, complex_PRECISION *tmp ) { - - int k, k1, k2, num_aggregates = l->is_PRECISION.num_agg, - num_eig_vect = l->next_level->num_lattice_site_var/2, - aggregate_size = l->inner_vector_size / num_aggregates, - clover_site_size = (l->next_level->num_lattice_site_var*(l->next_level->num_lattice_site_var+1))/2; - int t1, t2; - - config_PRECISION clover_pt, clover = l->next_level->op_PRECISION.clover; - - // just an abbreviation - int component_offset = OPERATOR_COMPONENT_OFFSET_PRECISION; - int fine_components = l->num_lattice_site_var; - - int aggregate = (fine_components*site)/aggregate_size; - clover_pt = clover + aggregate*clover_site_size; - - // U(x) = [ A B , A=A*, D=D*, C = -B* - // C D ] - // storage order: upper triangle of A, upper triangle of D, B, columnwise - // diagonal coupling - for ( int n=0; nnext_level->num_lattice_site_var/2, - D_link_size = num_eig_vect*num_eig_vect*4; - int t1, t2; - - config_PRECISION D_pt, D = l->next_level->op_PRECISION.D; - - // just an abbreviation - int component_offset = OPERATOR_COMPONENT_OFFSET_PRECISION; - int fine_components = l->num_lattice_site_var; - - int aggregate = (fine_components*site)/(l->inner_vector_size / l->is_PRECISION.num_agg); - D_pt = D + 
(4*aggregate+mu)*D_link_size; - - // U_mu(x) = [ A B , U_-mu(x+muhat) = [ A* -C* - // C D ] -B* D* ] - // storage order: A, C, B, D, each column wise - for ( int n=0; nis_PRECISION.num_agg, - num_eig_vect = l->next_level->num_parent_eig_vect, - aggregate_size = l->inner_vector_size / num_aggregates, - block_site_size = (l->next_level->num_parent_eig_vect*(l->next_level->num_parent_eig_vect+1)); - int t1, t2; - - config_PRECISION block_pt, block = l->next_level->op_PRECISION.odd_proj; - - // just an abbreviation - int component_offset = OPERATOR_COMPONENT_OFFSET_PRECISION; - int fine_components = l->num_lattice_site_var; - - int aggregate = (fine_components*site)/aggregate_size; - block_pt = block + aggregate*block_site_size; - - // U(x) = [ A 0 , A=A*, D=D* - // 0 D ] - // storage order: upper triangle of A, upper triangle of D, B, columnwise - // diagonal coupling - for ( int n=0; n i) { - ip = j; - jp = i; - sign = -1.0; - } - int offset_to_column = (ip*ip+ip)/2; // upper triangle including diagonal - out_tmp[(2*i+0)*column_offset + j] = creal(clover[offset_to_column+jp]); - out_tmp[(2*i+1)*column_offset + j] = sign*cimag(clover[offset_to_column+jp]); - // C = -B^dagger - out_tmp[(2*i+0)*column_offset + j + vecs] = -creal(clover[offset_to_B + j*vecs+i]); - out_tmp[(2*i+1)*column_offset + j + vecs] = cimag(clover[offset_to_B + j*vecs+i]); - } - // zero - for(int j=2*vecs; j i) { - ip = j; - jp = i; - sign = -1.0; - } - int offset_to_column = (ip*ip+ip)/2; // upper triangle including diagonal - out_tmp[(2*(i+vecs)+0)*column_offset + j + vecs] = creal(clover[offset_to_D + offset_to_column+jp]); - out_tmp[(2*(i+vecs)+1)*column_offset + j + vecs] = sign*cimag(clover[offset_to_D + offset_to_column+jp]); - } - // zero - for(int j=2*vecs; j i) { - ip = j; - jp = i; - sign = -1.0; - } - int offset_to_column = (ip*ip+ip)/2; // upper triangle including diagonal - // A - out_tmp[(2*(i+0*vecs)+0)*column_offset + j + 0*vecs] = - out_tmp[(2*(i+1*vecs)+0)*column_offset + j + 1*vecs] = creal(clover[offset_to_column+jp]); - out_tmp[(2*(i+0*vecs)+1)*column_offset + j + 0*vecs] = - out_tmp[(2*(i+1*vecs)+1)*column_offset + j + 1*vecs] = sign*cimag(clover[offset_to_column+jp]); - // B - out_tmp[(2*(i+2*vecs)+0)*column_offset + j + 0*vecs] = - out_tmp[(2*(i+3*vecs)+0)*column_offset + j + 1*vecs] = creal(clover[offset_to_B + i*vecs+j]); - out_tmp[(2*(i+2*vecs)+1)*column_offset + j + 0*vecs] = - out_tmp[(2*(i+3*vecs)+1)*column_offset + j + 1*vecs] = cimag(clover[offset_to_B + i*vecs+j]); - // C = -B^dagger - out_tmp[(2*(i+0*vecs)+0)*column_offset + j + 2*vecs] = - out_tmp[(2*(i+1*vecs)+0)*column_offset + j + 3*vecs] = -creal(clover[offset_to_B + j*vecs+i]); - out_tmp[(2*(i+0*vecs)+1)*column_offset + j + 2*vecs] = - out_tmp[(2*(i+1*vecs)+1)*column_offset + j + 3*vecs] = cimag(clover[offset_to_B + j*vecs+i]); - // D - out_tmp[(2*(i+2*vecs)+0)*column_offset + j + 2*vecs] = - out_tmp[(2*(i+3*vecs)+0)*column_offset + j + 3*vecs] = creal(clover[offset_to_D + offset_to_column+jp]); - out_tmp[(2*(i+2*vecs)+1)*column_offset + j + 2*vecs] = - out_tmp[(2*(i+3*vecs)+1)*column_offset + j + 3*vecs] = sign*cimag(clover[offset_to_D + offset_to_column+jp]); - // 0 - out_tmp[(2*(i+0*vecs)+0)*column_offset + j + 1*vecs] = - out_tmp[(2*(i+0*vecs)+1)*column_offset + j + 1*vecs] = - out_tmp[(2*(i+0*vecs)+0)*column_offset + j + 3*vecs] = - out_tmp[(2*(i+0*vecs)+1)*column_offset + j + 3*vecs] = - out_tmp[(2*(i+1*vecs)+0)*column_offset + j + 0*vecs] = - out_tmp[(2*(i+1*vecs)+1)*column_offset + j + 0*vecs] = - 
out_tmp[(2*(i+1*vecs)+0)*column_offset + j + 2*vecs] = - out_tmp[(2*(i+1*vecs)+1)*column_offset + j + 2*vecs] = - out_tmp[(2*(i+2*vecs)+0)*column_offset + j + 1*vecs] = - out_tmp[(2*(i+2*vecs)+1)*column_offset + j + 1*vecs] = - out_tmp[(2*(i+2*vecs)+0)*column_offset + j + 3*vecs] = - out_tmp[(2*(i+2*vecs)+1)*column_offset + j + 3*vecs] = - out_tmp[(2*(i+3*vecs)+0)*column_offset + j + 0*vecs] = - out_tmp[(2*(i+3*vecs)+1)*column_offset + j + 0*vecs] = - out_tmp[(2*(i+3*vecs)+0)*column_offset + j + 2*vecs] = - out_tmp[(2*(i+3*vecs)+1)*column_offset + j + 2*vecs] = 0.0; - } - // zero - for(int j=4*vecs; j i) { - ip = j; - jp = i; - sign = -1.0; - } - int offset_to_column = (ip*ip+ip)/2; // upper triangle including diagonal - // E - out_tmp[(2*i+0)*column_offset + j] += sign*creal(tm_term[offset_to_column+jp]); - out_tmp[(2*i+1)*column_offset + j] += cimag(tm_term[offset_to_column+jp]); - // F - out_tmp[(2*(i+vecs)+0)*column_offset + j + vecs] += sign*creal(tm_term[offset_to_F + offset_to_column+jp]); - out_tmp[(2*(i+vecs)+1)*column_offset + j + vecs] += cimag(tm_term[offset_to_F + offset_to_column+jp]); - } - } - tm_term += 2*offset_to_F; - // out_tmp is an alias for the actual output - out_tmp += 2*column_offset*2*vecs; - } -#endif -} - -void add_tm_term_to_doublet_vectorized_layout_PRECISION(config_PRECISION tm_term, OPERATOR_TYPE_PRECISION *clover_vectorized, - int num_aggregates, int num_eig_vect) { -#ifdef HAVE_TM - int vecs = num_eig_vect; - // in vectorized layout clover is stored column wise, but not split into ABCD - // each column is padded, such that next column can also start at 64B boundary - int column_offset = SIMD_LENGTH_PRECISION*((4*vecs+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - // offset between blocks in clover - int offset_to_F = (vecs*vecs+vecs)/2; // upper triangle of A including diagonal - - PRECISION *out_tmp = clover_vectorized; - - // we add/sub the tm term to cloverD_vectorized - // A0B0 E000 0000 - // 0A0B + 0000 - 0E00 - // C0D0 00F0 0000 - // 0C0D 0000 000F - // 0000 0000 0000 - // (column wise, size of zeros such that columns length is multiple of 64B) - - // 4 directions - for ( int a=0; a i) { - ip = j; - jp = i; - sign = -1.0; - } - int offset_to_column = (ip*ip+ip)/2; // upper triangle including diagonal - // E - out_tmp[(2*(i+0*vecs)+0)*column_offset + j + 0*vecs] += sign*creal(tm_term[offset_to_column+jp]); - out_tmp[(2*(i+0*vecs)+1)*column_offset + j + 0*vecs] += cimag(tm_term[offset_to_column+jp]); - out_tmp[(2*(i+1*vecs)+0)*column_offset + j + 1*vecs] -= sign*creal(tm_term[offset_to_column+jp]); - out_tmp[(2*(i+1*vecs)+1)*column_offset + j + 1*vecs] -= cimag(tm_term[offset_to_column+jp]); - // F - out_tmp[(2*(i+2*vecs)+0)*column_offset + j + 2*vecs] += sign*creal(tm_term[offset_to_F+offset_to_column+jp]); - out_tmp[(2*(i+2*vecs)+1)*column_offset + j + 2*vecs] += cimag(tm_term[offset_to_F+offset_to_column+jp]); - out_tmp[(2*(i+3*vecs)+0)*column_offset + j + 3*vecs] -= sign*creal(tm_term[offset_to_F+offset_to_column+jp]); - out_tmp[(2*(i+3*vecs)+1)*column_offset + j + 3*vecs] -= cimag(tm_term[offset_to_F+offset_to_column+jp]); - } - } - tm_term += 2*offset_to_F; - // out_tmp is an alias for the actual output - out_tmp += 2*4*vecs*column_offset; - } -#endif -} - -void add_epsbar_term_to_doublet_vectorized_layout_PRECISION(config_PRECISION eps_term, OPERATOR_TYPE_PRECISION *clover_vectorized, - int num_aggregates, int num_eig_vect) { -#ifdef HAVE_TM1p1 - int vecs = num_eig_vect; - // in vectorized layout clover is stored column wise, but not split 
into ABCD - // each column is padded, such that next column can also start at 64B boundary - int column_offset = SIMD_LENGTH_PRECISION*((4*vecs+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - // offset between blocks in clover - int offset_to_F = (vecs*vecs+vecs)/2; // upper triangle of A including diagonal - - PRECISION *out_tmp = clover_vectorized; - - // we add the eps term to cloverD_vectorized - // A0B0 0E00 - // 0A0B + E000 - // C0D0 000F - // 0C0D 00F0 - // 0000 0000 - // (column wise, size of zeros such that columns length is multiple of 64B) - - // 4 directions - for ( int a=0; a i) { - ip = j; - jp = i; - sign = -1.0; - } - int offset_to_column = (ip*ip+ip)/2; // upper triangle including diagonal - // E - out_tmp[(2*(i+0*vecs)+0)*column_offset + j + 1*vecs] += sign*creal(eps_term[offset_to_column+jp]); - out_tmp[(2*(i+0*vecs)+1)*column_offset + j + 1*vecs] += cimag(eps_term[offset_to_column+jp]); - out_tmp[(2*(i+1*vecs)+0)*column_offset + j + 0*vecs] += sign*creal(eps_term[offset_to_column+jp]); - out_tmp[(2*(i+1*vecs)+1)*column_offset + j + 0*vecs] += cimag(eps_term[offset_to_column+jp]); - // F - out_tmp[(2*(i+2*vecs)+0)*column_offset + j + 3*vecs] += sign*creal(eps_term[offset_to_F+offset_to_column+jp]); - out_tmp[(2*(i+2*vecs)+1)*column_offset + j + 3*vecs] += cimag(eps_term[offset_to_F+offset_to_column+jp]); - out_tmp[(2*(i+3*vecs)+0)*column_offset + j + 2*vecs] += sign*creal(eps_term[offset_to_F+offset_to_column+jp]); - out_tmp[(2*(i+3*vecs)+1)*column_offset + j + 2*vecs] += cimag(eps_term[offset_to_F+offset_to_column+jp]); - } - } - eps_term += 2*offset_to_F; - // out_tmp is an alias for the actual output - out_tmp += 2*4*vecs*column_offset; - } -#endif -} - -void coarse_aggregate_self_couplings_PRECISION_vectorized( complex_PRECISION *eta1, complex_PRECISION *eta2, - complex_PRECISION *phi, schwarz_PRECISION_struct *s, - level_struct *l, int site, int *direction_flags ) { - - int offset = SIMD_LENGTH_PRECISION; - int site_offset = l->num_lattice_site_var*offset; - int index_bw; - int index_fw; - int *neighbor = s->op.neighbor_table; - int *backward_neighbor = s->op.backward_neighbor_table; - complex_PRECISION *phi_pt; - config_PRECISION D_pt; - config_PRECISION D = s->op.D; - int n = l->num_lattice_site_var; - int D_site_offset = 4*n*n; - int D_link_offset = n*n; - int clover_offset = (n*(n+1))/2*site; - - coarse_spinwise_site_self_couplings_PRECISION_vectorized( eta1, eta2, phi+site_offset*site, s->op.clover+clover_offset, offset, l ); - - for(int mu=0; mu<4; mu++) { - index_fw = neighbor[5*site+1 + mu]; - index_bw = backward_neighbor[5*site+1 + mu]; - - // from backward - if ( direction_flags[2*mu+0] == 1 ) { - D_pt = D + D_site_offset*index_bw + D_link_offset*mu; - phi_pt = phi + site_offset*index_bw; - coarse_spinwise_n_daggered_hopp_PRECISION_vectorized( eta1, eta2, phi_pt, D_pt, offset, l ); - } - - // from forward - if ( direction_flags[2*mu+1] == 1 ) { - D_pt = D + D_site_offset*site + D_link_offset*mu; - phi_pt = phi + site_offset*index_fw; - coarse_spinwise_n_hopp_PRECISION_vectorized( eta1, eta2, phi_pt, D_pt, offset, l ); - } - } -} - -void coarse_aggregate_block_diagonal_PRECISION_vectorized( complex_PRECISION *eta1, complex_PRECISION *eta2, - complex_PRECISION *phi, schwarz_PRECISION_struct *s, - level_struct *l, int site ) { - - int offset = SIMD_LENGTH_PRECISION; - int site_offset = l->num_lattice_site_var*offset; - int n = l->num_parent_eig_vect; - int block_offset = (n*(n+1))*site; - - sse_coarse_aggregate_block_diagonal_PRECISION( eta1, eta2, 
phi+site_offset*site, s->op.odd_proj+block_offset, offset, l ); -} - -void coarse_aggregate_neighbor_couplings_PRECISION_vectorized( complex_PRECISION *eta1, complex_PRECISION *eta2, - complex_PRECISION *phi, const int mu, - schwarz_PRECISION_struct *s, level_struct *l, int site ) { - - int offset = SIMD_LENGTH_PRECISION; - int site_offset = l->num_lattice_site_var*offset; - int index_fw; - int *neighbor = s->op.neighbor_table; - complex_PRECISION *phi_pt; - config_PRECISION D_pt; - config_PRECISION D = s->op.D; - int n = l->num_lattice_site_var; - int D_site_offset = 4*n*n; - int D_link_offset = n*n; - - vector_PRECISION_define( eta1, 0, 0, n*offset, l ); - vector_PRECISION_define( eta2, 0, 0, n*offset, l ); - - // requires the positive boundaries of phi to be communicated before - index_fw = neighbor[5*site+1 + mu]; - D_pt = D + D_site_offset*site + D_link_offset*mu; - phi_pt = phi + site_offset*index_fw; - coarse_spinwise_hopp_PRECISION_vectorized( eta1, eta2, phi_pt, D_pt, offset, l ); -} - - -void coarse_spinwise_site_self_couplings_PRECISION_vectorized( - complex_PRECISION *eta1, complex_PRECISION *eta2, - complex_PRECISION *phi, config_PRECISION clover, int elements, level_struct *l ) { - - sse_coarse_spinwise_site_self_couplings_PRECISION( eta1, eta2, phi, clover, elements, l ); -} - -#endif diff --git a/src/sse_coarse_operator_generic.h b/src/sse_coarse_operator_generic.h deleted file mode 100644 index fb7391a..0000000 --- a/src/sse_coarse_operator_generic.h +++ /dev/null @@ -1,311 +0,0 @@ -/* - * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder. - * - * This file is part of the DDalphaAMG solver library. - * - * The DDalphaAMG solver library is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * The DDalphaAMG solver library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * - * You should have received a copy of the GNU General Public License - * along with the DDalphaAMG solver library. If not, see http://www.gnu.org/licenses/. 
- * - */ - -#ifndef SSE_COARSE_OPERATOR_PRECISION_HEADER - #define SSE_COARSE_OPERATOR_PRECISION_HEADER - - #ifdef SSE - - #include "blas_vectorized.h" - - void coarse_operator_PRECISION_setup_vectorized( complex_PRECISION *operator, level_struct *l, struct Thread *threading ); - void set_coarse_self_coupling_PRECISION_vectorized( complex_PRECISION *spin_0_1, complex_PRECISION *spin_2_3, - complex_PRECISION *V, level_struct *l, int site, const int n_rhs, complex_PRECISION *tmp ); - void set_coarse_self_coupling_PRECISION_vectorized_finalize( level_struct *l, int site, const int n_rhs, complex_PRECISION *tmp ); - // here we do not check whether site is really on boundary, caller is responsible for that - // tmp is used to store coarse operator with padding, until sum over all sites has been done - void set_coarse_neighbor_coupling_PRECISION_vectorized( complex_PRECISION *buffer1, complex_PRECISION *buffer2, - complex_PRECISION *V, const int mu, level_struct *l, int site, const int n_rhs, complex_PRECISION *tmp ); - void set_coarse_neighbor_coupling_PRECISION_vectorized_finalize( const int mu, level_struct *l, int site, const int n_rhs, complex_PRECISION *tmp ); - void set_coarse_block_diagonal_PRECISION_vectorized( complex_PRECISION *spin_0_1, complex_PRECISION *spin_2_3, - complex_PRECISION *V, level_struct *l, int site, const int n_rhs, complex_PRECISION *tmp ); - void set_coarse_block_diagonal_PRECISION_vectorized_finalize( level_struct *l, int site, const int n_rhs, complex_PRECISION *tmp ); - - void copy_coarse_operator_to_vectorized_layout_PRECISION(config_PRECISION D, - OPERATOR_TYPE_PRECISION *D_vectorized, int num_aggregates, int num_eig_vect); - // fw and bw links have a symmetry that allows constructing one from another, see, e.g., coarse_hopp_PRECISION - // for vectorization we store the operator for both cases, the "daggered" links need this transformed layout - void copy_coarse_operator_to_transformed_vectorized_layout_PRECISION(config_PRECISION D, - OPERATOR_TYPE_PRECISION *D_vectorized, int num_aggregates, int num_eig_vect); - void copy_coarse_operator_clover_to_vectorized_layout_PRECISION(config_PRECISION clover, - OPERATOR_TYPE_PRECISION *clover_vectorized, int num_aggregates, int num_eig_vect); - void copy_coarse_operator_clover_to_doublet_vectorized_layout_PRECISION(config_PRECISION clover, - OPERATOR_TYPE_PRECISION *clover_vectorized, int num_aggregates, int num_eig_vect); - void add_tm_term_to_vectorized_layout_PRECISION(config_PRECISION tm_term, - OPERATOR_TYPE_PRECISION *clover_vectorized, int num_aggregates, int num_eig_vect); - void add_tm_term_to_doublet_vectorized_layout_PRECISION(config_PRECISION tm_term, - OPERATOR_TYPE_PRECISION *clover_vectorized, int num_aggregates, int num_eig_vect); - void add_epsbar_term_to_doublet_vectorized_layout_PRECISION(config_PRECISION eps_term, - OPERATOR_TYPE_PRECISION *clover_vectorized, int num_aggregates, int num_eig_vect); - - void coarse_spinwise_site_self_couplings_PRECISION_vectorized( - complex_PRECISION *eta1, complex_PRECISION *eta2, - complex_PRECISION *phi, config_PRECISION clover, int elements, level_struct *l ); - - void coarse_aggregate_self_couplings_PRECISION_vectorized( complex_PRECISION *eta1, complex_PRECISION *eta2, - complex_PRECISION *phi, schwarz_PRECISION_struct *s, level_struct *l, - int site, int *direction_flags ); - - void coarse_aggregate_neighbor_couplings_PRECISION_vectorized( complex_PRECISION *eta1, complex_PRECISION *eta2, - complex_PRECISION *phi, const int mu, schwarz_PRECISION_struct *s, 
level_struct *l, - int site ); - - void coarse_aggregate_block_diagonal_PRECISION_vectorized( complex_PRECISION *eta1, complex_PRECISION *eta2, - complex_PRECISION *phi, schwarz_PRECISION_struct *s, level_struct *l, - int site); - - - static inline void coarse_hopp_PRECISION_vectorized( vector_PRECISION eta, vector_PRECISION phi, - OPERATOR_TYPE_PRECISION *D, level_struct *l ) { -#ifdef OPTIMIZED_COARSE_NEIGHBOR_COUPLING_PRECISION - int nv = l->num_parent_eig_vect; - int lda = 2*SIMD_LENGTH_PRECISION*((nv+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - cgenmv_padded( 2*nv, D, lda, nv, (float *)phi, (float *)eta); -#endif - } - static inline void coarse_n_hopp_PRECISION_vectorized( vector_PRECISION eta, vector_PRECISION phi, - OPERATOR_TYPE_PRECISION *D, level_struct *l ) { -#ifdef OPTIMIZED_COARSE_NEIGHBOR_COUPLING_PRECISION - int nv = l->num_parent_eig_vect; - int lda = 2*SIMD_LENGTH_PRECISION*((nv+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - cgemv_padded( 2*nv, D, lda, nv, (float *)phi, (float *)eta); -#endif - } - - static inline void coarse_self_couplings_PRECISION_vectorized( vector_PRECISION eta, vector_PRECISION phi, - operator_PRECISION_struct *op, int start, int end, level_struct *l ) { -#ifdef OPTIMIZED_COARSE_SELF_COUPLING_PRECISION - int site_size = l->num_lattice_site_var; - int lda = SIMD_LENGTH_PRECISION*((site_size+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); -#ifdef HAVE_TM1p1 - OPERATOR_TYPE_PRECISION *clover = (g.n_flavours == 2) ? op->clover_doublet_vectorized:op->clover_vectorized; -#else - OPERATOR_TYPE_PRECISION *clover = op->clover_vectorized; -#endif - for(int i=start; inum_lattice_site_var/2; - int num_eig_vect2 = num_eig_vect*num_eig_vect; - complex_PRECISION *eta[2] = {eta1, eta2}; - // U_mu(x) = [ A B , U_-mu(x+muhat) = [ A* -C* - // C D ] -B* D* ] - // storage order: A, C, B, D - - __m128 D_re; - __m128 D_im; - __m128 in_re; - __m128 in_im; - __m128 out_re; - __m128 out_im; - // s refers to "spin" components 0and1 (->eta1) or 2and3 (->eta2) - for(int s=0; s<2; s++) { - // t is the row of the input matrix (in 2x2 block form) - for(int t=0; t<2; t++) { - for(int i=0; inum_lattice_site_var/2; - int num_eig_vect2 = num_eig_vect*num_eig_vect; - complex_PRECISION *eta[2] = {eta1, eta2}; - // U_mu(x) = [ A B , U_-mu(x+muhat) = [ A* -C* - // C D ] -B* D* ] - // storage order: A, C, B, D - // note: minus sign of D = self_coupling - hopping_term is added here - - __m128 D_re; - __m128 D_im; - __m128 in_re; - __m128 in_im; - __m128 out_re; - __m128 out_im; - // s refers to "spin" components 0and1 (->eta1) or 2and3 (->eta2) - for(int s=0; s<2; s++) { - // t is the row of the input matrix (in 2x2 block form) - for(int t=0; t<2; t++) { - for(int i=0; inum_lattice_site_var/2; - int num_eig_vect2 = num_eig_vect*num_eig_vect; - complex_PRECISION *eta[2] = {eta1, eta2}; - // U_mu(x) = [ A B , U_-mu(x+muhat) = [ A* -C* - // C D ] -B* D* ] - // storage order: A, C, B, D - // note: minus sign of D = self_coupling - hopping_term is added here - - __m128 D_re; - __m128 D_im; - __m128 in_re; - __m128 in_im; - __m128 out_re; - __m128 out_im; - // A* - for(int i=0; i1?((k)*3+6):((k)*3)) -#define index_d_re(phi,mu,spin) (gamma_re_sign[mu][spin]) * (phi)[ 6*gamma_co[mu][spin] + 12*(gamma_co[mu][spin]/2) + gamma_offset[mu][spin] ] -#define index_d_im(phi,mu,spin) (gamma_im_sign[mu][spin]) * (phi)[ 6*gamma_co[mu][spin] + 12*(gamma_co[mu][spin]/2) - gamma_offset[mu][spin] +1 ] - -#define neighbor_coupling_file "sse_dirac_su3local.h" - -void prp_double( complex_double *prn[4], 
complex_double *phi, int start, int end ) { - - double *phi_pt = (double*)(phi+start); - double *phi_end = (double*)(phi+end); - double *pr[4] = {(double*)(prn[0]+start/2),(double*)(prn[1]+start/2),(double*)(prn[2]+start/2),(double*)(prn[3]+start/2)}; - - while ( phi_pt < phi_end ) { - - __m128d phi_pt1_re; __m128d phi_pt1_im; - - sse_complex_deinterleaved_load_pd( phi_pt, &phi_pt1_re, &phi_pt1_im ); - for ( int mu=0; mu<4; mu++) { - __m128d phi_pt2_re = _mm_setr_pd( index_re(phi_pt,mu,0), index_re(phi_pt+2,mu,0) ); - __m128d phi_pt2_im = _mm_setr_pd( index_im(phi_pt,mu,0), index_im(phi_pt+2,mu,0) ); - __m128d res_re = _mm_sub_pd( phi_pt1_re, phi_pt2_re ); - __m128d res_im = _mm_sub_pd( phi_pt1_im, phi_pt2_im ); - sse_complex_interleaved_store_pd( res_re, res_im, pr[mu] ); - pr[mu] += 2*SIMD_LENGTH_double; - } - - sse_complex_deinterleaved_load_pd( phi_pt+4, &phi_pt1_re, &phi_pt1_im ); - for ( int mu=0; mu<4; mu++) { - __m128d phi_pt2_re = _mm_setr_pd( index_re(phi_pt+4,mu,0), index_re(phi_pt,mu,1) ); - __m128d phi_pt2_im = _mm_setr_pd( index_im(phi_pt+4,mu,0), index_im(phi_pt,mu,1) ); - __m128d res_re = _mm_sub_pd( phi_pt1_re, phi_pt2_re ); - __m128d res_im = _mm_sub_pd( phi_pt1_im, phi_pt2_im ); - sse_complex_interleaved_store_pd( res_re, res_im, pr[mu] ); - pr[mu] += 2*SIMD_LENGTH_double; - } - - sse_complex_deinterleaved_load_pd( phi_pt+8, &phi_pt1_re, &phi_pt1_im ); - for ( int mu=0; mu<4; mu++) { - __m128d phi_pt2_re = _mm_setr_pd( index_re(phi_pt+2,mu,1), index_re(phi_pt+4,mu,1) ); - __m128d phi_pt2_im = _mm_setr_pd( index_im(phi_pt+2,mu,1), index_im(phi_pt+4,mu,1) ); - __m128d res_re = _mm_sub_pd( phi_pt1_re, phi_pt2_re ); - __m128d res_im = _mm_sub_pd( phi_pt1_im, phi_pt2_im ); - sse_complex_interleaved_store_pd( res_re, res_im, pr[mu] ); - pr[mu] += 2*SIMD_LENGTH_double; - } - - phi_pt += 24; - } -} - - -void prp_float( complex_float *prn[4], complex_float *phi, int start, int end ) { - - float *phi_pt = (float*)(phi+start); - float *phi_end = (float*)(phi+end); - float *pr[4] = {(float*)(prn[0]+start/2),(float*)(prn[1]+start/2),(float*)(prn[2]+start/2),(float*)(prn[3]+start/2)}; - - while ( phi_pt < phi_end ) { - - __m128 phi_pt1_re = _mm_setr_ps( phi_pt[0], phi_pt[2], phi_pt[4], phi_pt[6] ); - __m128 phi_pt1_im = _mm_setr_ps( phi_pt[1], phi_pt[3], phi_pt[5], phi_pt[7] ); - for ( int mu=0; mu<4; mu++) { - __m128 phi_pt2_re = _mm_setr_ps( index_re(phi_pt,mu,0), index_re(phi_pt+2,mu,0), - index_re(phi_pt+4,mu,0), index_re(phi_pt,mu,1) ); - __m128 phi_pt2_im = _mm_setr_ps( index_im(phi_pt,mu,0), index_im(phi_pt+2,mu,0), - index_im(phi_pt+4,mu,0), index_im(phi_pt,mu,1) ); - - __m128 res_re = _mm_sub_ps( phi_pt1_re, phi_pt2_re ); - __m128 res_im = _mm_sub_ps( phi_pt1_im, phi_pt2_im ); - - sse_complex_interleaved_store( res_re, res_im, pr[mu] ); - pr[mu] += 8; - } - - phi_pt1_re = _mm_setr_ps( phi_pt[8], phi_pt[10], phi_pt[24], phi_pt[26] ); - phi_pt1_im = _mm_setr_ps( phi_pt[9], phi_pt[11], phi_pt[25], phi_pt[27] ); - for ( int mu=0; mu<4; mu++) { - __m128 phi_pt2_re = _mm_setr_ps( index_re(phi_pt+2,mu,1), index_re(phi_pt+4,mu,1), - index_re(phi_pt+24,mu,0), index_re(phi_pt+26,mu,0) ); - __m128 phi_pt2_im = _mm_setr_ps( index_im(phi_pt+2,mu,1), index_im(phi_pt+4,mu,1), - index_im(phi_pt+24,mu,0), index_im(phi_pt+26,mu,0) ); - __m128 res_re = _mm_sub_ps( phi_pt1_re, phi_pt2_re ); - __m128 res_im = _mm_sub_ps( phi_pt1_im, phi_pt2_im ); - - sse_complex_interleaved_store( res_re, res_im, pr[mu] ); - pr[mu] += 8; - } - - phi_pt1_re = _mm_setr_ps( phi_pt[28], phi_pt[30], phi_pt[32], 
phi_pt[34] ); - phi_pt1_im = _mm_setr_ps( phi_pt[29], phi_pt[31], phi_pt[33], phi_pt[35] ); - for ( int mu=0; mu<4; mu++) { - __m128 phi_pt2_re = _mm_setr_ps( index_re(phi_pt+28,mu,0), index_re(phi_pt+24,mu,1), - index_re(phi_pt+26,mu,1), index_re(phi_pt+28,mu,1) ); - __m128 phi_pt2_im = _mm_setr_ps( index_im(phi_pt+28,mu,0), index_im(phi_pt+24,mu,1), - index_im(phi_pt+26,mu,1), index_im(phi_pt+28,mu,1) ); - __m128 res_re = _mm_sub_ps( phi_pt1_re, phi_pt2_re ); - __m128 res_im = _mm_sub_ps( phi_pt1_im, phi_pt2_im ); - - sse_complex_interleaved_store( res_re, res_im, pr[mu] ); - pr[mu] += 8; - } - - phi_pt+=48; - } -} - - -void dprp_double( complex_double *prn[4], complex_double *phi, int start, int end ) { - - double *phi_pt = (double*)(phi+start); - double *phi_end = (double*)(phi+end); - double *pr[4] = {(double*)(prn[0]+start/2),(double*)(prn[1]+start/2),(double*)(prn[2]+start/2),(double*)(prn[3]+start/2)}; - - while ( phi_pt < phi_end ) { - - __m128d phi_pt1_re; __m128d phi_pt1_im; - - sse_complex_deinterleaved_load_pd( phi_pt, &phi_pt1_re, &phi_pt1_im ); - for ( int mu=0; mu<4; mu++) { - __m128d phi_pt2_re = _mm_setr_pd( index_d_re(phi_pt,mu,0), index_d_re(phi_pt+2,mu,0) ); - __m128d phi_pt2_im = _mm_setr_pd( index_d_im(phi_pt,mu,0), index_d_im(phi_pt+2,mu,0) ); - __m128d res_re = _mm_sub_pd( phi_pt1_re, phi_pt2_re ); - __m128d res_im = _mm_sub_pd( phi_pt1_im, phi_pt2_im ); - sse_complex_interleaved_store_pd( res_re, res_im, pr[mu] ); - pr[mu] += 2*SIMD_LENGTH_double; - } - - sse_complex_deinterleaved_load_pd( phi_pt+4, &phi_pt1_re, &phi_pt1_im ); - for ( int mu=0; mu<4; mu++) { - __m128d phi_pt2_re = _mm_setr_pd( index_d_re(phi_pt+4,mu,0), index_d_re(phi_pt,mu,1) ); - __m128d phi_pt2_im = _mm_setr_pd( index_d_im(phi_pt+4,mu,0), index_d_im(phi_pt,mu,1) ); - __m128d res_re = _mm_sub_pd( phi_pt1_re, phi_pt2_re ); - __m128d res_im = _mm_sub_pd( phi_pt1_im, phi_pt2_im ); - sse_complex_interleaved_store_pd( res_re, res_im, pr[mu] ); - pr[mu] += 2*SIMD_LENGTH_double; - } - - sse_complex_deinterleaved_load_pd( phi_pt+8, &phi_pt1_re, &phi_pt1_im ); - for ( int mu=0; mu<4; mu++) { - __m128d phi_pt2_re = _mm_setr_pd( index_d_re(phi_pt+2,mu,1), index_d_re(phi_pt+4,mu,1) ); - __m128d phi_pt2_im = _mm_setr_pd( index_d_im(phi_pt+2,mu,1), index_d_im(phi_pt+4,mu,1) ); - __m128d res_re = _mm_sub_pd( phi_pt1_re, phi_pt2_re ); - __m128d res_im = _mm_sub_pd( phi_pt1_im, phi_pt2_im ); - sse_complex_interleaved_store_pd( res_re, res_im, pr[mu] ); - pr[mu] += 2*SIMD_LENGTH_double; - } - - sse_complex_deinterleaved_load_pd( phi_pt+12, &phi_pt1_re, &phi_pt1_im ); - for ( int mu=0; mu<4; mu++) { - __m128d phi_pt2_re = _mm_setr_pd( index_d_re(phi_pt+12,mu,0), index_d_re(phi_pt+14,mu,0) ); - __m128d phi_pt2_im = _mm_setr_pd( index_d_im(phi_pt+12,mu,0), index_d_im(phi_pt+14,mu,0) ); - __m128d res_re = _mm_sub_pd( phi_pt1_re, phi_pt2_re ); - __m128d res_im = _mm_sub_pd( phi_pt1_im, phi_pt2_im ); - sse_complex_interleaved_store_pd( res_re, res_im, pr[mu] ); - pr[mu] += 2*SIMD_LENGTH_double; - } - - sse_complex_deinterleaved_load_pd( phi_pt+16, &phi_pt1_re, &phi_pt1_im ); - for ( int mu=0; mu<4; mu++) { - __m128d phi_pt2_re = _mm_setr_pd( index_d_re(phi_pt+16,mu,0), index_d_re(phi_pt+12,mu,1) ); - __m128d phi_pt2_im = _mm_setr_pd( index_d_im(phi_pt+16,mu,0), index_d_im(phi_pt+12,mu,1) ); - __m128d res_re = _mm_sub_pd( phi_pt1_re, phi_pt2_re ); - __m128d res_im = _mm_sub_pd( phi_pt1_im, phi_pt2_im ); - sse_complex_interleaved_store_pd( res_re, res_im, pr[mu] ); - pr[mu] += 2*SIMD_LENGTH_double; - } - - 
sse_complex_deinterleaved_load_pd( phi_pt+20, &phi_pt1_re, &phi_pt1_im ); - for ( int mu=0; mu<4; mu++) { - __m128d phi_pt2_re = _mm_setr_pd( index_d_re(phi_pt+14,mu,1), index_d_re(phi_pt+16,mu,1) ); - __m128d phi_pt2_im = _mm_setr_pd( index_d_im(phi_pt+14,mu,1), index_d_im(phi_pt+16,mu,1) ); - __m128d res_re = _mm_sub_pd( phi_pt1_re, phi_pt2_re ); - __m128d res_im = _mm_sub_pd( phi_pt1_im, phi_pt2_im ); - sse_complex_interleaved_store_pd( res_re, res_im, pr[mu] ); - pr[mu] += 2*SIMD_LENGTH_double; - } - - phi_pt += 48; - } -} - - -void dprp_float( complex_float *prn[4], complex_float *phi, int start, int end ) { - - float *phi_pt = (float*)(phi+start); - float *phi_end = (float*)(phi+end); - float *pr[4] = {(float*)(prn[0]+start/2),(float*)(prn[1]+start/2),(float*)(prn[2]+start/2),(float*)(prn[3]+start/2)}; - - while ( phi_pt < phi_end ) { - - __m128 phi_pt1_re = _mm_setr_ps( phi_pt[0], phi_pt[2], phi_pt[4], phi_pt[6] ); - __m128 phi_pt1_im = _mm_setr_ps( phi_pt[1], phi_pt[3], phi_pt[5], phi_pt[7] ); - for ( int mu=0; mu<4; mu++) { - __m128 phi_pt2_re = _mm_setr_ps( index_d_re(phi_pt,mu,0), index_d_re(phi_pt+2,mu,0), - index_d_re(phi_pt+4,mu,0), index_d_re(phi_pt,mu,1) ); - __m128 phi_pt2_im = _mm_setr_ps( index_d_im(phi_pt,mu,0), index_d_im(phi_pt+2,mu,0), - index_d_im(phi_pt+4,mu,0), index_d_im(phi_pt,mu,1) ); - - __m128 res_re = _mm_sub_ps( phi_pt1_re, phi_pt2_re ); - __m128 res_im = _mm_sub_ps( phi_pt1_im, phi_pt2_im ); - - sse_complex_interleaved_store( res_re, res_im, pr[mu] ); - pr[mu] += 8; - } - - phi_pt1_re = _mm_setr_ps( phi_pt[8], phi_pt[10], phi_pt[12], phi_pt[14] ); - phi_pt1_im = _mm_setr_ps( phi_pt[9], phi_pt[11], phi_pt[13], phi_pt[15] ); - for ( int mu=0; mu<4; mu++) { - __m128 phi_pt2_re = _mm_setr_ps( index_d_re(phi_pt+2,mu,1), index_d_re(phi_pt+4,mu,1), - index_d_re(phi_pt+12,mu,0), index_d_re(phi_pt+14,mu,0) ); - __m128 phi_pt2_im = _mm_setr_ps( index_d_im(phi_pt+2,mu,1), index_d_im(phi_pt+4,mu,1), - index_d_im(phi_pt+12,mu,0), index_d_im(phi_pt+14,mu,0) ); - __m128 res_re = _mm_sub_ps( phi_pt1_re, phi_pt2_re ); - __m128 res_im = _mm_sub_ps( phi_pt1_im, phi_pt2_im ); - - sse_complex_interleaved_store( res_re, res_im, pr[mu] ); - pr[mu] += 8; - } - - phi_pt1_re = _mm_setr_ps( phi_pt[16], phi_pt[18], phi_pt[20], phi_pt[22] ); - phi_pt1_im = _mm_setr_ps( phi_pt[17], phi_pt[19], phi_pt[21], phi_pt[23] ); - for ( int mu=0; mu<4; mu++) { - __m128 phi_pt2_re = _mm_setr_ps( index_d_re(phi_pt+16,mu,0), index_d_re(phi_pt+12,mu,1), - index_d_re(phi_pt+14,mu,1), index_d_re(phi_pt+16,mu,1) ); - __m128 phi_pt2_im = _mm_setr_ps( index_d_im(phi_pt+16,mu,0), index_d_im(phi_pt+12,mu,1), - index_d_im(phi_pt+14,mu,1), index_d_im(phi_pt+16,mu,1) ); - __m128 res_re = _mm_sub_ps( phi_pt1_re, phi_pt2_re ); - __m128 res_im = _mm_sub_ps( phi_pt1_im, phi_pt2_im ); - - sse_complex_interleaved_store( res_re, res_im, pr[mu] ); - pr[mu] += 8; - } - - phi_pt+=48; - } -} - - -void prn_su3_double( complex_double *prp[4], complex_double *phi, operator_double_struct *op, int *neighbor, int start, int end ) { - - double *phi_pt = (double*)(phi+start); - double *phi_end_pt = (double*)(phi+end); - double *pr[4] = {(double*)(prp[0]),(double*)(prp[1]),(double*)(prp[2]),(double*)(prp[3])}; - double *D_pt = ((double*)(op->D))+2*(start*3); - int *nb_pt = neighbor+((start/12)*4); - - while ( phi_pt < phi_end_pt ) { - - __m128d in_re[3]; - __m128d in_im[3]; - - for ( int i=0; i<3; i++ ) { - in_re[i] = _mm_setr_pd( phi_pt[2*i+0], phi_pt[2*i+6] ); - in_im[i] = _mm_setr_pd( phi_pt[2*i+1], phi_pt[2*i+7] ); - } - - 
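The cmul_conj_pd / cfmadd_conj_pd helpers used in the loop that follows keep real and imaginary parts in separate __m128d lanes; in scalar terms they perform a complex multiply(-accumulate) with one factor conjugated, which is how the daggered link enters the negative-direction projection here. A scalar sketch under that assumption (the conjugated factor is taken to be the first operand; this is an illustration, not the library API):

    #include <complex.h>
    /* scalar analogue, assuming the first factor is the conjugated one:
         cmul_conj   : *res  = conj(a) * b
         cfmadd_conj : *res += conj(a) * b */
    static inline void cmul_conj_scalar  ( double complex a, double complex b, double complex *res ) { *res  = conj(a) * b; }
    static inline void cfmadd_conj_scalar( double complex a, double complex b, double complex *res ) { *res += conj(a) * b; }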
for ( int mu=0; mu<4; mu++ ) { - - __m128d v_re[3]; - __m128d v_im[3]; - - // calc spin projection - for ( int i=0; i<3; i++ ) { - v_re[i] = _mm_setr_pd( index_re(phi_pt+2*i,mu,0), index_re(phi_pt+2*i,mu,1) ); - v_im[i] = _mm_setr_pd( index_im(phi_pt+2*i,mu,0), index_im(phi_pt+2*i,mu,1) ); - v_re[i] = _mm_add_pd( in_re[i], v_re[i] ); - v_im[i] = _mm_add_pd( in_im[i], v_im[i] ); - } - - { - __m128d res_re[3]; - __m128d res_im[3]; - // load su(3) matrix and multiply - for ( int i=0; i<3; i++ ) { - __m128d buf_re = _mm_set1_pd( D_pt[0+2*i] ); - __m128d buf_im = _mm_set1_pd( D_pt[1+2*i] ); - cmul_conj_pd( buf_re, buf_im, v_re[0], v_im[0], &res_re[i], &res_im[i] ); - buf_re = _mm_set1_pd( D_pt[6+2*i] ); - buf_im = _mm_set1_pd( D_pt[7+2*i] ); - cfmadd_conj_pd( buf_re, buf_im, v_re[1], v_im[1], &res_re[i], &res_im[i] ); - buf_re = _mm_set1_pd( D_pt[12+2*i] ); - buf_im = _mm_set1_pd( D_pt[13+2*i] ); - cfmadd_conj_pd( buf_re, buf_im, v_re[2], v_im[2], &res_re[i], &res_im[i] ); - } - - { - double *pr_pt = pr[mu]+2*6*(*(nb_pt)); - for ( int i=0; i<3; i++ ) { - __m128d out1 = _mm_unpacklo_pd( res_re[i], res_im[i] ); - __m128d out2 = _mm_unpackhi_pd( res_re[i], res_im[i] ); - _mm_storeu_pd( pr_pt+0+2*i, out1 ); - _mm_storeu_pd( pr_pt+6+2*i, out2 ); - } - } - } - - D_pt += 18; - nb_pt++; - } - - phi_pt += 12*2; - } - -} - - -void prn_su3_float( complex_float *prp[4], complex_float *phi, operator_float_struct *op, int *neighbor, int start, int end ) { - - float *phi_pt = (float*)(phi+start); - float *phi_end_pt = (float*)(phi+end); - float *pr[4] = {(float*)(prp[0]),(float*)(prp[1]),(float*)(prp[2]),(float*)(prp[3])}; - float *D_pt = (float*)(op->D_transformed_vectorized+2*(start*4)); - int *nb_pt = neighbor+((start/12)*4); - - while ( phi_pt < phi_end_pt ) { - - __m128 in1[2]; - __m128 in2[2]; - - in1[0] = _mm_setr_ps( phi_pt[0], phi_pt[2], phi_pt[4], 0 ); - in1[1] = _mm_setr_ps( phi_pt[1], phi_pt[3], phi_pt[5], 0 ); - in2[0] = _mm_setr_ps( phi_pt[6], phi_pt[8], phi_pt[10], 0 ); - in2[1] = _mm_setr_ps( phi_pt[7], phi_pt[9], phi_pt[11], 0 ); - - for ( int mu=0; mu<4; mu++ ) { - __m128 res1[2]; - __m128 res2[2]; - - { - // calc spin0 projection - res1[0] = _mm_setr_ps( index_re(phi_pt,mu,0), index_re(phi_pt+2,mu,0), index_re(phi_pt+4,mu,0), 0 ); - res1[1] = _mm_setr_ps( index_im(phi_pt,mu,0), index_im(phi_pt+2,mu,0), index_im(phi_pt+4,mu,0), 0 ); - __m128 in1_re = _mm_add_ps( in1[0], res1[0] ); - __m128 in1_im = _mm_add_ps( in1[1], res1[1] ); - - // calc spin1 projection - res1[0] = _mm_setr_ps( index_re(phi_pt,mu,1), index_re(phi_pt+2,mu,1), index_re(phi_pt+4,mu,1), 0 ); - res1[1] = _mm_setr_ps( index_im(phi_pt,mu,1), index_im(phi_pt+2,mu,1), index_im(phi_pt+4,mu,1), 0 ); - __m128 in2_re = _mm_add_ps( in2[0], res1[0] ); - __m128 in2_im = _mm_add_ps( in2[1], res1[1] ); - - // load 1st part of su(3) matrix and multiply - { - __m128 buf1 = _mm_loadu_ps( D_pt ); - __m128 buf2 = _mm_loadu_ps( D_pt+SIMD_LENGTH_float ); - { - __m128 buf3 = _mm_shuffle_ps( in1_re, in1_re, _MM_SHUFFLE(0,0,0,0) ); - __m128 buf4 = _mm_shuffle_ps( in1_im, in1_im, _MM_SHUFFLE(0,0,0,0) ); - cmul_conj( buf1, buf2, buf3, buf4, &res1[0], &res1[1] ); - } - { - __m128 buf3 = _mm_shuffle_ps( in2_re, in2_re, _MM_SHUFFLE(0,0,0,0) ); - __m128 buf4 = _mm_shuffle_ps( in2_im, in2_im, _MM_SHUFFLE(0,0,0,0) ); - cmul_conj( buf1, buf2, buf3, buf4, &res2[0], &res2[1] ); - } - } - // load 2nd part of su(3) matrix and multiply - { - __m128 buf1 = _mm_loadu_ps( D_pt+2*SIMD_LENGTH_float ); - __m128 buf2 = _mm_loadu_ps( D_pt+3*SIMD_LENGTH_float ); - { - 
__m128 buf3 = _mm_shuffle_ps( in1_re, in1_re, _MM_SHUFFLE(1,1,1,1) ); - __m128 buf4 = _mm_shuffle_ps( in1_im, in1_im, _MM_SHUFFLE(1,1,1,1) ); - cfmadd_conj( buf1, buf2, buf3, buf4, &res1[0], &res1[1] ); - } - { - __m128 buf3 = _mm_shuffle_ps( in2_re, in2_re, _MM_SHUFFLE(1,1,1,1) ); - __m128 buf4 = _mm_shuffle_ps( in2_im, in2_im, _MM_SHUFFLE(1,1,1,1) ); - cfmadd_conj( buf1, buf2, buf3, buf4, &res2[0], &res2[1] ); - } - } - // load 3rd part of su(3) matrix and multiply - { - __m128 buf1 = _mm_loadu_ps( D_pt+4*SIMD_LENGTH_float ); - __m128 buf2 = _mm_loadu_ps( D_pt+5*SIMD_LENGTH_float ); - { - __m128 buf3 = _mm_shuffle_ps( in1_re, in1_re, _MM_SHUFFLE(2,2,2,2) ); - __m128 buf4 = _mm_shuffle_ps( in1_im, in1_im, _MM_SHUFFLE(2,2,2,2) ); - cfmadd_conj( buf1, buf2, buf3, buf4, &res1[0], &res1[1] ); - } - { - __m128 buf3 = _mm_shuffle_ps( in2_re, in2_re, _MM_SHUFFLE(2,2,2,2) ); - __m128 buf4 = _mm_shuffle_ps( in2_im, in2_im, _MM_SHUFFLE(2,2,2,2) ); - cfmadd_conj( buf1, buf2, buf3, buf4, &res2[0], &res2[1] ); - } - } - } - - { - __m128 buf1 = _mm_unpacklo_ps( res1[0], res1[1] ); - __m128 buf2 = _mm_unpackhi_ps( res1[0], res1[1] ); - __m128 buf3 = _mm_unpacklo_ps( res2[0], res2[1] ); - - { - __m128 buf4 = _mm_unpackhi_ps( res2[0], res2[1] ); - buf2 = _mm_movelh_ps( buf2, buf3 ); - buf3 = _mm_shuffle_ps( buf3, buf4, _MM_SHUFFLE(1,0,3,2) ); // hi(buf3), lo(buf4) - } - { - float *pr_pt = pr[mu]+2*6*(*nb_pt); - _mm_storeu_ps( pr_pt, buf1 ); - _mm_storeu_ps( pr_pt+4, buf2 ); - _mm_storeu_ps( pr_pt+8, buf3 ); - } - } - nb_pt++; - D_pt += 24; - } - - phi_pt += 24; - } -} - - -void dprn_su3_double( complex_double *prp[4], complex_double *phi, operator_double_struct *op, int *neighbor, int start, int end ) { - - double *phi_pt = (double*)(phi+start); - double *phi_end_pt = (double*)(phi+end); - double *pr[4] = {(double*)(prp[0]),(double*)(prp[1]),(double*)(prp[2]),(double*)(prp[3])}; - double *D_pt = ((double*)(op->D))+2*(start/24*36); - int *nb_pt = neighbor+((start/24)*4); - - while ( phi_pt < phi_end_pt ) { - - __m128d in_re[6]; - __m128d in_im[6]; - - for ( int i=0; i<3; i++ ) { - in_re[i] = _mm_setr_pd( phi_pt[2*i+0], phi_pt[2*i+6] ); - in_im[i] = _mm_setr_pd( phi_pt[2*i+1], phi_pt[2*i+7] ); - } - for ( int i=3; i<6; i++ ) { - in_re[i] = _mm_setr_pd( phi_pt[2*i+6], phi_pt[2*i+12] ); - in_im[i] = _mm_setr_pd( phi_pt[2*i+7], phi_pt[2*i+13] ); - } - - for ( int mu=0; mu<4; mu++ ) { - - __m128d v_re[6]; - __m128d v_im[6]; - - // calc spin projection - for ( int i=0; i<3; i++ ) { - v_re[i] = _mm_setr_pd( index_d_re(phi_pt+2*i,mu,0), index_d_re(phi_pt+2*i,mu,1) ); - v_im[i] = _mm_setr_pd( index_d_im(phi_pt+2*i,mu,0), index_d_im(phi_pt+2*i,mu,1) ); - v_re[i] = _mm_add_pd( in_re[i], v_re[i] ); - v_im[i] = _mm_add_pd( in_im[i], v_im[i] ); - } - for ( int i=3; i<6; i++ ) { - v_re[i] = _mm_setr_pd( index_d_re(phi_pt+6+2*i,mu,0), index_d_re(phi_pt+6+2*i,mu,1) ); - v_im[i] = _mm_setr_pd( index_d_im(phi_pt+6+2*i,mu,0), index_d_im(phi_pt+6+2*i,mu,1) ); - v_re[i] = _mm_add_pd( in_re[i], v_re[i] ); - v_im[i] = _mm_add_pd( in_im[i], v_im[i] ); - } - - { - __m128d res_re[6]; - __m128d res_im[6]; - // load su(3) matrix and multiply - for ( int i=0; i<3; i++ ) { - __m128d buf_re = _mm_set1_pd( D_pt[0+2*i] ); - __m128d buf_im = _mm_set1_pd( D_pt[1+2*i] ); - cmul_conj_pd( buf_re, buf_im, v_re[0], v_im[0], &res_re[i], &res_im[i] ); - cmul_conj_pd( buf_re, buf_im, v_re[3], v_im[3], &res_re[i+3], &res_im[i+3] ); - buf_re = _mm_set1_pd( D_pt[6+2*i] ); - buf_im = _mm_set1_pd( D_pt[7+2*i] ); - cfmadd_conj_pd( buf_re, buf_im, v_re[1], 
v_im[1], &res_re[i], &res_im[i] ); - cfmadd_conj_pd( buf_re, buf_im, v_re[4], v_im[4], &res_re[i+3], &res_im[i+3] ); - buf_re = _mm_set1_pd( D_pt[12+2*i] ); - buf_im = _mm_set1_pd( D_pt[13+2*i] ); - cfmadd_conj_pd( buf_re, buf_im, v_re[2], v_im[2], &res_re[i], &res_im[i] ); - cfmadd_conj_pd( buf_re, buf_im, v_re[5], v_im[5], &res_re[i+3], &res_im[i+3] ); - } - - { - double *pr_pt = pr[mu]+2*12*(*(nb_pt)); - for ( int i=0; i<3; i++ ) { - __m128d out1 = _mm_unpacklo_pd( res_re[i], res_im[i] ); - __m128d out2 = _mm_unpackhi_pd( res_re[i], res_im[i] ); - _mm_storeu_pd( pr_pt+0+2*i, out1 ); - _mm_storeu_pd( pr_pt+6+2*i, out2 ); - } - for ( int i=3; i<6; i++ ) { - __m128d out1 = _mm_unpacklo_pd( res_re[i], res_im[i] ); - __m128d out2 = _mm_unpackhi_pd( res_re[i], res_im[i] ); - _mm_storeu_pd( pr_pt+ 6+2*i, out1 ); - _mm_storeu_pd( pr_pt+12+2*i, out2 ); - } - } - } - - D_pt += 18; - nb_pt++; - } - - phi_pt += 24*2; - } - -} - - -void dprn_su3_float( complex_float *prp[4], complex_float *phi, operator_float_struct *op, int *neighbor, int start, int end ) { - - float *phi_pt = (float*)(phi+start); - float *phi_end_pt = (float*)(phi+end); - float *pr[4] = {(float*)(prp[0]),(float*)(prp[1]),(float*)(prp[2]),(float*)(prp[3])}; - float *D_pt = (float*)(op->D_transformed_vectorized+2*(start/24*48)); - int *nb_pt = neighbor+((start/24)*4); - - while ( phi_pt < phi_end_pt ) { - - __m128 in11[2]; - __m128 in21[2]; - __m128 in12[2]; - __m128 in22[2]; - - in11[0] = _mm_setr_ps( phi_pt[0], phi_pt[2], phi_pt[4], 0 ); - in11[1] = _mm_setr_ps( phi_pt[1], phi_pt[3], phi_pt[5], 0 ); - in21[0] = _mm_setr_ps( phi_pt[6], phi_pt[8], phi_pt[10], 0 ); - in21[1] = _mm_setr_ps( phi_pt[7], phi_pt[9], phi_pt[11], 0 ); - in12[0] = _mm_setr_ps( phi_pt[12], phi_pt[14], phi_pt[16], 0 ); - in12[1] = _mm_setr_ps( phi_pt[13], phi_pt[15], phi_pt[17], 0 ); - in22[0] = _mm_setr_ps( phi_pt[18], phi_pt[20], phi_pt[22], 0 ); - in22[1] = _mm_setr_ps( phi_pt[19], phi_pt[21], phi_pt[23], 0 ); - - for ( int mu=0; mu<4; mu++ ) { - __m128 res11[2]; - __m128 res21[2]; - __m128 res12[2]; - __m128 res22[2]; - - { - // calc spin0 projection - res11[0] = _mm_setr_ps( index_d_re(phi_pt,mu,0), index_d_re(phi_pt+2,mu,0), index_d_re(phi_pt+4,mu,0), 0 ); - res11[1] = _mm_setr_ps( index_d_im(phi_pt,mu,0), index_d_im(phi_pt+2,mu,0), index_d_im(phi_pt+4,mu,0), 0 ); - __m128 in11_re = _mm_add_ps( in11[0], res11[0] ); - __m128 in11_im = _mm_add_ps( in11[1], res11[1] ); - - // calc spin1 projection - res11[0] = _mm_setr_ps( index_d_re(phi_pt,mu,1), index_d_re(phi_pt+2,mu,1), index_d_re(phi_pt+4,mu,1), 0 ); - res11[1] = _mm_setr_ps( index_d_im(phi_pt,mu,1), index_d_im(phi_pt+2,mu,1), index_d_im(phi_pt+4,mu,1), 0 ); - __m128 in21_re = _mm_add_ps( in21[0], res11[0] ); - __m128 in21_im = _mm_add_ps( in21[1], res11[1] ); - - // calc spin0 projection - res12[0] = _mm_setr_ps( index_d_re(phi_pt+12,mu,0), index_d_re(phi_pt+14,mu,0), index_d_re(phi_pt+16,mu,0), 0 ); - res12[1] = _mm_setr_ps( index_d_im(phi_pt+12,mu,0), index_d_im(phi_pt+14,mu,0), index_d_im(phi_pt+16,mu,0), 0 ); - __m128 in12_re = _mm_add_ps( in12[0], res12[0] ); - __m128 in12_im = _mm_add_ps( in12[1], res12[1] ); - - // calc spin1 projection - res12[0] = _mm_setr_ps( index_d_re(phi_pt+12,mu,1), index_d_re(phi_pt+14,mu,1), index_d_re(phi_pt+16,mu,1), 0 ); - res12[1] = _mm_setr_ps( index_d_im(phi_pt+12,mu,1), index_d_im(phi_pt+14,mu,1), index_d_im(phi_pt+16,mu,1), 0 ); - __m128 in22_re = _mm_add_ps( in22[0], res12[0] ); - __m128 in22_im = _mm_add_ps( in22[1], res12[1] ); - - // load 1st part of su(3) 
matrix and multiply - { - __m128 buf1 = _mm_loadu_ps( D_pt ); - __m128 buf2 = _mm_loadu_ps( D_pt+SIMD_LENGTH_float ); - { - __m128 buf3 = _mm_shuffle_ps( in11_re, in11_re, _MM_SHUFFLE(0,0,0,0) ); - __m128 buf4 = _mm_shuffle_ps( in11_im, in11_im, _MM_SHUFFLE(0,0,0,0) ); - cmul_conj( buf1, buf2, buf3, buf4, &res11[0], &res11[1] ); - } - { - __m128 buf3 = _mm_shuffle_ps( in21_re, in21_re, _MM_SHUFFLE(0,0,0,0) ); - __m128 buf4 = _mm_shuffle_ps( in21_im, in21_im, _MM_SHUFFLE(0,0,0,0) ); - cmul_conj( buf1, buf2, buf3, buf4, &res21[0], &res21[1] ); - } - { - __m128 buf3 = _mm_shuffle_ps( in12_re, in12_re, _MM_SHUFFLE(0,0,0,0) ); - __m128 buf4 = _mm_shuffle_ps( in12_im, in12_im, _MM_SHUFFLE(0,0,0,0) ); - cmul_conj( buf1, buf2, buf3, buf4, &res12[0], &res12[1] ); - } - { - __m128 buf3 = _mm_shuffle_ps( in22_re, in22_re, _MM_SHUFFLE(0,0,0,0) ); - __m128 buf4 = _mm_shuffle_ps( in22_im, in22_im, _MM_SHUFFLE(0,0,0,0) ); - cmul_conj( buf1, buf2, buf3, buf4, &res22[0], &res22[1] ); - } - } - // load 2nd part of su(3) matrix and multiply - { - __m128 buf1 = _mm_loadu_ps( D_pt+2*SIMD_LENGTH_float ); - __m128 buf2 = _mm_loadu_ps( D_pt+3*SIMD_LENGTH_float ); - { - __m128 buf3 = _mm_shuffle_ps( in11_re, in11_re, _MM_SHUFFLE(1,1,1,1) ); - __m128 buf4 = _mm_shuffle_ps( in11_im, in11_im, _MM_SHUFFLE(1,1,1,1) ); - cfmadd_conj( buf1, buf2, buf3, buf4, &res11[0], &res11[1] ); - } - { - __m128 buf3 = _mm_shuffle_ps( in21_re, in21_re, _MM_SHUFFLE(1,1,1,1) ); - __m128 buf4 = _mm_shuffle_ps( in21_im, in21_im, _MM_SHUFFLE(1,1,1,1) ); - cfmadd_conj( buf1, buf2, buf3, buf4, &res21[0], &res21[1] ); - } - { - __m128 buf3 = _mm_shuffle_ps( in12_re, in12_re, _MM_SHUFFLE(1,1,1,1) ); - __m128 buf4 = _mm_shuffle_ps( in12_im, in12_im, _MM_SHUFFLE(1,1,1,1) ); - cfmadd_conj( buf1, buf2, buf3, buf4, &res12[0], &res12[1] ); - } - { - __m128 buf3 = _mm_shuffle_ps( in22_re, in22_re, _MM_SHUFFLE(1,1,1,1) ); - __m128 buf4 = _mm_shuffle_ps( in22_im, in22_im, _MM_SHUFFLE(1,1,1,1) ); - cfmadd_conj( buf1, buf2, buf3, buf4, &res22[0], &res22[1] ); - } - } - // load 3rd part of su(3) matrix and multiply - { - __m128 buf1 = _mm_loadu_ps( D_pt+4*SIMD_LENGTH_float ); - __m128 buf2 = _mm_loadu_ps( D_pt+5*SIMD_LENGTH_float ); - { - __m128 buf3 = _mm_shuffle_ps( in11_re, in11_re, _MM_SHUFFLE(2,2,2,2) ); - __m128 buf4 = _mm_shuffle_ps( in11_im, in11_im, _MM_SHUFFLE(2,2,2,2) ); - cfmadd_conj( buf1, buf2, buf3, buf4, &res11[0], &res11[1] ); - } - { - __m128 buf3 = _mm_shuffle_ps( in21_re, in21_re, _MM_SHUFFLE(2,2,2,2) ); - __m128 buf4 = _mm_shuffle_ps( in21_im, in21_im, _MM_SHUFFLE(2,2,2,2) ); - cfmadd_conj( buf1, buf2, buf3, buf4, &res21[0], &res21[1] ); - } - { - __m128 buf3 = _mm_shuffle_ps( in12_re, in12_re, _MM_SHUFFLE(2,2,2,2) ); - __m128 buf4 = _mm_shuffle_ps( in12_im, in12_im, _MM_SHUFFLE(2,2,2,2) ); - cfmadd_conj( buf1, buf2, buf3, buf4, &res12[0], &res12[1] ); - } - { - __m128 buf3 = _mm_shuffle_ps( in22_re, in22_re, _MM_SHUFFLE(2,2,2,2) ); - __m128 buf4 = _mm_shuffle_ps( in22_im, in22_im, _MM_SHUFFLE(2,2,2,2) ); - cfmadd_conj( buf1, buf2, buf3, buf4, &res22[0], &res22[1] ); - } - } - } - - float *pr_pt = pr[mu]+2*12*(*nb_pt); - { - __m128 buf1 = _mm_unpacklo_ps( res11[0], res11[1] ); - __m128 buf2 = _mm_unpackhi_ps( res11[0], res11[1] ); - __m128 buf3 = _mm_unpacklo_ps( res21[0], res21[1] ); - - { - __m128 buf4 = _mm_unpackhi_ps( res21[0], res21[1] ); - buf2 = _mm_movelh_ps( buf2, buf3 ); - buf3 = _mm_shuffle_ps( buf3, buf4, _MM_SHUFFLE(1,0,3,2) ); // hi(buf3), lo(buf4) - } - { - _mm_storeu_ps( pr_pt, buf1 ); - _mm_storeu_ps( pr_pt+4, buf2 
); - _mm_storeu_ps( pr_pt+8, buf3 ); - } - } - { - __m128 buf1 = _mm_unpacklo_ps( res12[0], res12[1] ); - __m128 buf2 = _mm_unpackhi_ps( res12[0], res12[1] ); - __m128 buf3 = _mm_unpacklo_ps( res22[0], res22[1] ); - - { - __m128 buf4 = _mm_unpackhi_ps( res22[0], res22[1] ); - buf2 = _mm_movelh_ps( buf2, buf3 ); - buf3 = _mm_shuffle_ps( buf3, buf4, _MM_SHUFFLE(1,0,3,2) ); // hi(buf3), lo(buf4) - } - { - _mm_storeu_ps( pr_pt+12, buf1 ); - _mm_storeu_ps( pr_pt+16, buf2 ); - _mm_storeu_ps( pr_pt+20, buf3 ); - } - } - nb_pt++; - D_pt += 24; - } - - phi_pt += 48; - } -} - - -void pbn_double( complex_double *eta, complex_double *prp[4], int start, int end ) { - - double *pr[4] = {(double*)(prp[0]),(double*)(prp[1]),(double*)(prp[2]),(double*)(prp[3])}; - double *eta_pt = (double*)(eta+start); - - __m128d gamma0[4]; - __m128d gamma1[4]; - - for ( int mu=0; mu<4; mu++ ) { - gamma0[mu] = _mm_setr_pd( gamma_re_sign[mu][gamma_co[mu][0]], gamma_im_sign[mu][gamma_co[mu][0]] ); - gamma1[mu] = _mm_setr_pd( gamma_re_sign[mu][gamma_co[mu][1]], gamma_im_sign[mu][gamma_co[mu][1]] ); - } - - for ( int i=start; iD))+2*(start*3); - double *eta_pt = (double*)(eta+start); - double *eta_end_pt = (double*)(eta+end); - double *pr[4] = {(double*)(prn[0]),(double*)(prn[1]),(double*)(prn[2]),(double*)(prn[3])}; - int *nb_pt = neighbor+((start/12)*4); - - __m128d gamma0[4]; - __m128d gamma1[4]; - - for ( int mu=0; mu<4; mu++ ) { - gamma0[mu] = _mm_setr_pd( -gamma_re_sign[mu][gamma_co[mu][0]], -gamma_im_sign[mu][gamma_co[mu][0]] ); - gamma1[mu] = _mm_setr_pd( -gamma_re_sign[mu][gamma_co[mu][1]], -gamma_im_sign[mu][gamma_co[mu][1]] ); - } - - while( eta_pt < eta_end_pt ) { - - __m128d res[12]; - for ( int i=0; i<12; i++ ) { - res[i] = _mm_loadu_pd( eta_pt + 2*i ); - } - - // --------------- - // mu = T - { - __m128d res_re[3]; - __m128d res_im[3]; - { - __m128d v_re[3]; - __m128d v_im[3]; - int j = 2*6*(*nb_pt); - - for ( int i=0; i<3; i++ ) { - v_re[i] = _mm_setr_pd( *(pr[T]+j+0+2*i), *(pr[T]+j+6+2*i) ); - v_im[i] = _mm_setr_pd( *(pr[T]+j+1+2*i), *(pr[T]+j+7+2*i) ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf_re = _mm_set1_pd( D_pt[0+6*i] ); - __m128d buf_im = _mm_set1_pd( D_pt[1+6*i] ); - cmul_pd( buf_re, buf_im, v_re[0], v_im[0], &res_re[i], &res_im[i] ); - buf_re = _mm_set1_pd( D_pt[2+6*i] ); - buf_im = _mm_set1_pd( D_pt[3+6*i] ); - cfmadd_pd( buf_re, buf_im, v_re[1], v_im[1], &res_re[i], &res_im[i] ); - buf_re = _mm_set1_pd( D_pt[4+6*i] ); - buf_im = _mm_set1_pd( D_pt[5+6*i] ); - cfmadd_pd( buf_re, buf_im, v_re[2], v_im[2], &res_re[i], &res_im[i] ); - } - D_pt += 18; - nb_pt++; - } - { - __m128d in[6]; - in[0] = _mm_unpacklo_pd( res_re[0], res_im[0] ); - in[1] = _mm_unpacklo_pd( res_re[1], res_im[1] ); - in[2] = _mm_unpacklo_pd( res_re[2], res_im[2] ); - - in[3] = _mm_unpackhi_pd( res_re[0], res_im[0] ); - in[4] = _mm_unpackhi_pd( res_re[1], res_im[1] ); - in[5] = _mm_unpackhi_pd( res_re[2], res_im[2] ); - - for ( int i=0; i<6; i++ ) { - res[i] = _mm_sub_pd( res[i], in[i] ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf1 = _mm_mul_pd( gamma0[T], GAMMA_T_SHUFFLE(in[i]) ); - res[3*gamma_co[T][0]+i] = _mm_sub_pd( res[3*gamma_co[T][0]+i], buf1 ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf1 = _mm_mul_pd( gamma1[T], GAMMA_T_SHUFFLE(in[3+i]) ); - res[3*gamma_co[T][1]+i] = _mm_sub_pd( res[3*gamma_co[T][1]+i], buf1 ); - } - } - } - // --------------- - // mu = Z - { - __m128d res_re[3]; - __m128d res_im[3]; - { - __m128d v_re[3]; - __m128d v_im[3]; - int j = 2*6*(*nb_pt); - - for ( int i=0; i<3; i++ ) { - v_re[i] = 
_mm_setr_pd( *(pr[Z]+j+0+2*i), *(pr[Z]+j+6+2*i) ); - v_im[i] = _mm_setr_pd( *(pr[Z]+j+1+2*i), *(pr[Z]+j+7+2*i) ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf_re = _mm_set1_pd( D_pt[0+6*i] ); - __m128d buf_im = _mm_set1_pd( D_pt[1+6*i] ); - cmul_pd( buf_re, buf_im, v_re[0], v_im[0], &res_re[i], &res_im[i] ); - buf_re = _mm_set1_pd( D_pt[2+6*i] ); - buf_im = _mm_set1_pd( D_pt[3+6*i] ); - cfmadd_pd( buf_re, buf_im, v_re[1], v_im[1], &res_re[i], &res_im[i] ); - buf_re = _mm_set1_pd( D_pt[4+6*i] ); - buf_im = _mm_set1_pd( D_pt[5+6*i] ); - cfmadd_pd( buf_re, buf_im, v_re[2], v_im[2], &res_re[i], &res_im[i] ); - } - D_pt += 18; - nb_pt++; - } - { - __m128d in[6]; - in[0] = _mm_unpacklo_pd( res_re[0], res_im[0] ); - in[1] = _mm_unpacklo_pd( res_re[1], res_im[1] ); - in[2] = _mm_unpacklo_pd( res_re[2], res_im[2] ); - - in[3] = _mm_unpackhi_pd( res_re[0], res_im[0] ); - in[4] = _mm_unpackhi_pd( res_re[1], res_im[1] ); - in[5] = _mm_unpackhi_pd( res_re[2], res_im[2] ); - - for ( int i=0; i<6; i++ ) { - res[i] = _mm_sub_pd( res[i], in[i] ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf1 = _mm_mul_pd( gamma0[Z], GAMMA_Z_SHUFFLE(in[i]) ); - res[3*gamma_co[Z][0]+i] = _mm_sub_pd( res[3*gamma_co[Z][0]+i], buf1 ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf1 = _mm_mul_pd( gamma1[Z], GAMMA_Z_SHUFFLE(in[3+i]) ); - res[3*gamma_co[Z][1]+i] = _mm_sub_pd( res[3*gamma_co[Z][1]+i], buf1 ); - } - } - } - // --------------- - // mu = Y - { - __m128d res_re[3]; - __m128d res_im[3]; - { - __m128d v_re[3]; - __m128d v_im[3]; - int j = 2*6*(*nb_pt); - - for ( int i=0; i<3; i++ ) { - v_re[i] = _mm_setr_pd( *(pr[Y]+j+0+2*i), *(pr[Y]+j+6+2*i) ); - v_im[i] = _mm_setr_pd( *(pr[Y]+j+1+2*i), *(pr[Y]+j+7+2*i) ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf_re = _mm_set1_pd( D_pt[0+6*i] ); - __m128d buf_im = _mm_set1_pd( D_pt[1+6*i] ); - cmul_pd( buf_re, buf_im, v_re[0], v_im[0], &res_re[i], &res_im[i] ); - buf_re = _mm_set1_pd( D_pt[2+6*i] ); - buf_im = _mm_set1_pd( D_pt[3+6*i] ); - cfmadd_pd( buf_re, buf_im, v_re[1], v_im[1], &res_re[i], &res_im[i] ); - buf_re = _mm_set1_pd( D_pt[4+6*i] ); - buf_im = _mm_set1_pd( D_pt[5+6*i] ); - cfmadd_pd( buf_re, buf_im, v_re[2], v_im[2], &res_re[i], &res_im[i] ); - } - D_pt += 18; - nb_pt++; - } - { - __m128d in[6]; - in[0] = _mm_unpacklo_pd( res_re[0], res_im[0] ); - in[1] = _mm_unpacklo_pd( res_re[1], res_im[1] ); - in[2] = _mm_unpacklo_pd( res_re[2], res_im[2] ); - - in[3] = _mm_unpackhi_pd( res_re[0], res_im[0] ); - in[4] = _mm_unpackhi_pd( res_re[1], res_im[1] ); - in[5] = _mm_unpackhi_pd( res_re[2], res_im[2] ); - - for ( int i=0; i<6; i++ ) { - res[i] = _mm_sub_pd( res[i], in[i] ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf1 = _mm_mul_pd( gamma0[Y], GAMMA_Y_SHUFFLE(in[i]) ); - res[3*gamma_co[Y][0]+i] = _mm_sub_pd( res[3*gamma_co[Y][0]+i], buf1 ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf1 = _mm_mul_pd( gamma1[Y], GAMMA_Y_SHUFFLE(in[3+i]) ); - res[3*gamma_co[Y][1]+i] = _mm_sub_pd( res[3*gamma_co[Y][1]+i], buf1 ); - } - } - } - // --------------- - // mu = X - { - __m128d res_re[3]; - __m128d res_im[3]; - { - __m128d v_re[3]; - __m128d v_im[3]; - int j = 2*6*(*nb_pt); - - for ( int i=0; i<3; i++ ) { - v_re[i] = _mm_setr_pd( *(pr[X]+j+0+2*i), *(pr[X]+j+6+2*i) ); - v_im[i] = _mm_setr_pd( *(pr[X]+j+1+2*i), *(pr[X]+j+7+2*i) ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf_re = _mm_set1_pd( D_pt[0+6*i] ); - __m128d buf_im = _mm_set1_pd( D_pt[1+6*i] ); - cmul_pd( buf_re, buf_im, v_re[0], v_im[0], &res_re[i], &res_im[i] ); - buf_re = _mm_set1_pd( D_pt[2+6*i] ); - 
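In scalar form, each cmul_pd/cfmadd_pd triple in these per-direction blocks is one row of a complex 3x3 colour matrix-vector product; the strides D_pt[0+6*i], D_pt[2+6*i], D_pt[4+6*i] correspond to row-major storage with interleaved real/imaginary doubles. A scalar sketch of the same product (illustrative helper, not part of the code being removed):

    #include <complex.h>
    /* illustrative scalar version of the unrolled SU(3) multiply: res = D * v,
       with D stored row-major as a 3x3 array of complex numbers. */
    static void su3_mvm_scalar( double complex res[3], const double complex D[9], const double complex v[3] ) {
      for ( int i=0; i<3; i++ )
        res[i] = D[3*i+0]*v[0] + D[3*i+1]*v[1] + D[3*i+2]*v[2];
    }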
buf_im = _mm_set1_pd( D_pt[3+6*i] ); - cfmadd_pd( buf_re, buf_im, v_re[1], v_im[1], &res_re[i], &res_im[i] ); - buf_re = _mm_set1_pd( D_pt[4+6*i] ); - buf_im = _mm_set1_pd( D_pt[5+6*i] ); - cfmadd_pd( buf_re, buf_im, v_re[2], v_im[2], &res_re[i], &res_im[i] ); - } - D_pt += 18; - nb_pt++; - } - { - __m128d in[6]; - in[0] = _mm_unpacklo_pd( res_re[0], res_im[0] ); - in[1] = _mm_unpacklo_pd( res_re[1], res_im[1] ); - in[2] = _mm_unpacklo_pd( res_re[2], res_im[2] ); - - in[3] = _mm_unpackhi_pd( res_re[0], res_im[0] ); - in[4] = _mm_unpackhi_pd( res_re[1], res_im[1] ); - in[5] = _mm_unpackhi_pd( res_re[2], res_im[2] ); - - for ( int i=0; i<6; i++ ) { - res[i] = _mm_sub_pd( res[i], in[i] ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf1 = _mm_mul_pd( gamma0[X], GAMMA_X_SHUFFLE(in[i]) ); - res[3*gamma_co[X][0]+i] = _mm_sub_pd( res[3*gamma_co[X][0]+i], buf1 ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf1 = _mm_mul_pd( gamma1[X], GAMMA_X_SHUFFLE(in[3+i]) ); - res[3*gamma_co[X][1]+i] = _mm_sub_pd( res[3*gamma_co[X][1]+i], buf1 ); - } - } - } - // --------------- - - for ( int i=0; i<12; i++ ) { - _mm_storeu_pd( eta_pt + 2*i, res[i] ); - } - eta_pt+=24; - } - -} - - -void su3_pbp_float( complex_float* eta, complex_float *prn[4], operator_float_struct *op, - int *neighbor, int start, int end ) { - - float *D_pt = (float*)(op->D_vectorized+2*(start*4)); - float *eta_pt = (float*)(eta+start); - float *eta_end_pt = (float*)(eta+end); - float *pr[4] = {(float*)(prn[0]),(float*)(prn[1]),(float*)(prn[2]),(float*)(prn[3])}; - int *nb_pt = neighbor+((start/12)*4); - - __m128 gamma0[4][2]; - __m128 gamma1[4][2]; - - for ( int mu=0; mu<4; mu++ ) { - gamma0[mu][0] = _mm_set1_ps( -gamma_re_sign[mu][gamma_co[mu][0]] ); - gamma0[mu][1] = _mm_set1_ps( -gamma_im_sign[mu][gamma_co[mu][0]] ); - gamma1[mu][0] = _mm_set1_ps( -gamma_re_sign[mu][gamma_co[mu][1]] ); - gamma1[mu][1] = _mm_set1_ps( -gamma_im_sign[mu][gamma_co[mu][1]] ); - } - - while( eta_pt < eta_end_pt ) { - - __m128 eta_lo1 = _mm_loadu_ps( eta_pt ); - __m128 eta_lo2 = _mm_loadu_ps( eta_pt + 4 ); - __m128 eta_hi1 = _mm_loadu_ps( eta_pt + 6 ); - __m128 eta_hi2 = _mm_loadu_ps( eta_pt + 10 ); - - __m128 eta2_lo[2]; - __m128 eta2_hi[2]; - - eta2_lo[0] = _mm_loadu_ps( eta_pt + 12 ); - eta2_hi[0] = _mm_loadu_ps( eta_pt + 14 ); - eta2_lo[1] = _mm_loadu_ps( eta_pt + 18 ); - eta2_hi[1] = _mm_loadu_ps( eta_pt + 20 ); - - for ( int mu=0; mu<4; mu++ ) { - __m128 res1[2]; - __m128 res2[2]; - - { - int j = 2*6*(*nb_pt); - // load 1st part of su(3) matrix and multiply - { - __m128 buf1 = _mm_loadu_ps( D_pt ); - __m128 buf2 = _mm_loadu_ps( D_pt+SIMD_LENGTH_float ); - { - __m128 buf3 = _mm_set1_ps( *(pr[mu]+j+0) ); - __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+1) ); - cmul( buf1, buf2, buf3, buf4, &res1[0], &res1[1] ); - } - { - __m128 buf3 = _mm_set1_ps( *(pr[mu]+j+6) ); - __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+7) ); - cmul( buf1, buf2, buf3, buf4, &res2[0], &res2[1] ); - } - } - // load 2nd part of su(3) matrix and multiply - { - __m128 buf1 = _mm_loadu_ps( D_pt+2*SIMD_LENGTH_float ); - __m128 buf2 = _mm_loadu_ps( D_pt+3*SIMD_LENGTH_float ); - { - __m128 buf3 = _mm_set1_ps( *(pr[mu]+j+2) ); - __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+3) ); - cfmadd( buf1, buf2, buf3, buf4, &res1[0], &res1[1] ); - } - { - __m128 buf3 = _mm_set1_ps( *(pr[mu]+j+8) ); - __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+9) ); - cfmadd( buf1, buf2, buf3, buf4, &res2[0], &res2[1] ); - } - } - // load 3rd part of su(3) matrix and multiply - { - __m128 buf1 = _mm_loadu_ps( D_pt+4*SIMD_LENGTH_float ); - __m128 
buf2 = _mm_loadu_ps( D_pt+5*SIMD_LENGTH_float ); - { - __m128 buf3 = _mm_set1_ps( *(pr[mu]+j+4) ); - __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+5) ); - cfmadd( buf1, buf2, buf3, buf4, &res1[0], &res1[1] ); - } - { - __m128 buf3 = _mm_set1_ps( *(pr[mu]+j+10) ); - __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+11) ); - cfmadd( buf1, buf2, buf3, buf4, &res2[0], &res2[1] ); - } - } - } - - { - // store spin0 contribution - { - __m128 buf1 = _mm_unpacklo_ps( res1[0], res1[1] ); - __m128 buf2 = _mm_unpackhi_ps( res1[0], res1[1] ); - eta_lo1 = _mm_sub_ps( eta_lo1, buf1 ); - eta_lo2 = _mm_sub_ps( eta_lo2, buf2 ); - } - - // store contribution from 1st SU(3) multiplication to either spin2 or spin3 - __m128 buf1 = _mm_mul_ps( gamma0[mu][0], res1[gamma_offset[mu][0]] ); - __m128 buf2 = _mm_mul_ps( gamma0[mu][1], res1[1-gamma_offset[mu][0]] ); - __m128 buf3 = _mm_unpacklo_ps( buf1, buf2 ); - __m128 buf4 = _mm_unpackhi_ps( buf1, buf2 ); - buf4 = _mm_shuffle_ps( buf3, buf4, _MM_SHUFFLE(1,0,3,2) ); // hi(buf3), lo(buf4) - eta2_lo[gamma_co[mu][2]] = _mm_sub_ps( eta2_lo[gamma_co[mu][2]], buf3 ); - eta2_hi[gamma_co[mu][2]] = _mm_sub_ps( eta2_hi[gamma_co[mu][2]], buf4 ); - } - - { - // store spin1 contribution - { - __m128 buf1 = _mm_unpacklo_ps( res2[0], res2[1] ); - __m128 buf2 = _mm_unpackhi_ps( res2[0], res2[1] ); - eta_hi1 = _mm_sub_ps( eta_hi1, buf1 ); - eta_hi2 = _mm_sub_ps( eta_hi2, buf2 ); - } - - // store contribution from 1st SU(3) multiplication to either spin2 or spin3 - __m128 buf1 = _mm_mul_ps( gamma1[mu][0], res2[gamma_offset[mu][1]] ); - __m128 buf2 = _mm_mul_ps( gamma1[mu][1], res2[1-gamma_offset[mu][1]] ); - __m128 buf3 = _mm_unpacklo_ps( buf1, buf2 ); - __m128 buf4 = _mm_unpackhi_ps( buf1, buf2 ); - buf4 = _mm_shuffle_ps( buf3, buf4, _MM_SHUFFLE(1,0,3,2) ); // hi(buf3), lo(buf4) - eta2_lo[gamma_co[mu][3]] = _mm_sub_ps( eta2_lo[gamma_co[mu][3]], buf3 ); - eta2_hi[gamma_co[mu][3]] = _mm_sub_ps( eta2_hi[gamma_co[mu][3]], buf4 ); - } - - nb_pt++; - D_pt += 24; - } - - _mm_storeu_ps( eta_pt, eta_lo1 ); - _mm_storeu_ps( eta_pt+4, eta_lo2 ); - _mm_storeu_ps( eta_pt+6, eta_hi1 ); - _mm_storeu_ps( eta_pt+10, eta_hi2 ); - _mm_storeu_ps( eta_pt+12, eta2_lo[0] ); - _mm_storeu_ps( eta_pt+14, eta2_hi[0] ); - _mm_storeu_ps( eta_pt+18, eta2_lo[1] ); - _mm_storeu_ps( eta_pt+20, eta2_hi[1] ); - - eta_pt += 24; - } - -} - - -void su3_dpbp_double( complex_double* eta, complex_double *prn[4], operator_double_struct *op, - int *neighbor, int start, int end ) { - - double *D_pt = ((double*)(op->D))+2*(start/24*36); - double *eta_pt = (double*)(eta+start); - double *eta_end_pt = (double*)(eta+end); - double *pr[4] = {(double*)(prn[0]),(double*)(prn[1]),(double*)(prn[2]),(double*)(prn[3])}; - int *nb_pt = neighbor+((start/24)*4); - - __m128d gamma0[4]; - __m128d gamma1[4]; - - for ( int mu=0; mu<4; mu++ ) { - gamma0[mu] = _mm_setr_pd( -gamma_re_sign[mu][gamma_co[mu][0]], -gamma_im_sign[mu][gamma_co[mu][0]] ); - gamma1[mu] = _mm_setr_pd( -gamma_re_sign[mu][gamma_co[mu][1]], -gamma_im_sign[mu][gamma_co[mu][1]] ); - } - - while( eta_pt < eta_end_pt ) { - - __m128d res[24]; - for ( int i=0; i<24; i++ ) { - res[i] = _mm_loadu_pd( eta_pt + 2*i ); - } - - // --------------- - // mu = T - { - __m128d res_re[6]; - __m128d res_im[6]; - { - __m128d v_re[6]; - __m128d v_im[6]; - int j = 2*12*(*nb_pt); - - for ( int i=0; i<3; i++ ) { - v_re[i] = _mm_setr_pd( *(pr[T]+j+0+2*i), *(pr[T]+j+6+2*i) ); - v_im[i] = _mm_setr_pd( *(pr[T]+j+1+2*i), *(pr[T]+j+7+2*i) ); - } - for ( int i=3; i<6; i++ ) { - v_re[i] = _mm_setr_pd( *(pr[T]+j+6+2*i), 
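The cmul / cfmadd (single precision) and cmul_pd / cfmadd_pd (double precision) helpers used throughout these kernels act on registers that hold real and imaginary parts separately; their definitions do not appear in this part of the diff. A minimal sketch of what such split-layout helpers compute, assuming that convention and using (a+ib)(c+id) = (ac-bd) + i(ad+bc):

#include <emmintrin.h>

/* sketch: r = a*b for complex operands kept as separate re/im registers */
static inline void cmul_pd_sketch( __m128d a_re, __m128d a_im, __m128d b_re, __m128d b_im,
                                   __m128d *r_re, __m128d *r_im ) {
  *r_re = _mm_sub_pd( _mm_mul_pd( a_re, b_re ), _mm_mul_pd( a_im, b_im ) );
  *r_im = _mm_add_pd( _mm_mul_pd( a_re, b_im ), _mm_mul_pd( a_im, b_re ) );
}

/* sketch: r += a*b, same layout */
static inline void cfmadd_pd_sketch( __m128d a_re, __m128d a_im, __m128d b_re, __m128d b_im,
                                     __m128d *r_re, __m128d *r_im ) {
  *r_re = _mm_add_pd( *r_re, _mm_sub_pd( _mm_mul_pd( a_re, b_re ), _mm_mul_pd( a_im, b_im ) ) );
  *r_im = _mm_add_pd( *r_im, _mm_add_pd( _mm_mul_pd( a_re, b_im ), _mm_mul_pd( a_im, b_re ) ) );
}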
*(pr[T]+j+12+2*i) ); - v_im[i] = _mm_setr_pd( *(pr[T]+j+7+2*i), *(pr[T]+j+13+2*i) ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf_re = _mm_set1_pd( D_pt[0+6*i] ); - __m128d buf_im = _mm_set1_pd( D_pt[1+6*i] ); - cmul_pd( buf_re, buf_im, v_re[0], v_im[0], &res_re[i], &res_im[i] ); - cmul_pd( buf_re, buf_im, v_re[3], v_im[3], &res_re[i+3], &res_im[i+3] ); - buf_re = _mm_set1_pd( D_pt[2+6*i] ); - buf_im = _mm_set1_pd( D_pt[3+6*i] ); - cfmadd_pd( buf_re, buf_im, v_re[1], v_im[1], &res_re[i], &res_im[i] ); - cfmadd_pd( buf_re, buf_im, v_re[4], v_im[4], &res_re[i+3], &res_im[i+3] ); - buf_re = _mm_set1_pd( D_pt[4+6*i] ); - buf_im = _mm_set1_pd( D_pt[5+6*i] ); - cfmadd_pd( buf_re, buf_im, v_re[2], v_im[2], &res_re[i], &res_im[i] ); - cfmadd_pd( buf_re, buf_im, v_re[5], v_im[5], &res_re[i+3], &res_im[i+3] ); - } - D_pt += 18; - nb_pt++; - } - { - __m128d in[12]; - in[0] = _mm_unpacklo_pd( res_re[0], res_im[0] ); - in[1] = _mm_unpacklo_pd( res_re[1], res_im[1] ); - in[2] = _mm_unpacklo_pd( res_re[2], res_im[2] ); - - in[3] = _mm_unpackhi_pd( res_re[0], res_im[0] ); - in[4] = _mm_unpackhi_pd( res_re[1], res_im[1] ); - in[5] = _mm_unpackhi_pd( res_re[2], res_im[2] ); - - in[6] = _mm_unpacklo_pd( res_re[3], res_im[3] ); - in[7] = _mm_unpacklo_pd( res_re[4], res_im[4] ); - in[8] = _mm_unpacklo_pd( res_re[5], res_im[5] ); - - in[9] = _mm_unpackhi_pd( res_re[3], res_im[3] ); - in[10] = _mm_unpackhi_pd( res_re[4], res_im[4] ); - in[11] = _mm_unpackhi_pd( res_re[5], res_im[5] ); - - for ( int i=0; i<12; i++ ) { - res[i] = _mm_sub_pd( res[i], in[i] ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf1 = _mm_mul_pd( gamma0[T], GAMMA_T_SHUFFLE(in[i]) ); - res[3*gamma_co[T][0]+6*(gamma_co[T][0]/2)+i] = _mm_sub_pd( res[3*gamma_co[T][0]+6*(gamma_co[T][0]/2)+i], buf1 ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf1 = _mm_mul_pd( gamma1[T], GAMMA_T_SHUFFLE(in[i+3]) ); - res[3*gamma_co[T][1]+6*(gamma_co[T][1]/2)+i] = _mm_sub_pd( res[3*gamma_co[T][1]+6*(gamma_co[T][1]/2)+i], buf1 ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf1 = _mm_mul_pd( gamma0[T], GAMMA_T_SHUFFLE(in[i+6]) ); - res[3*gamma_co[T][0]+6*(gamma_co[T][0]/2)+6+i] = _mm_sub_pd( res[3*gamma_co[T][0]+6*(gamma_co[T][0]/2)+6+i], buf1 ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf1 = _mm_mul_pd( gamma1[T], GAMMA_T_SHUFFLE(in[i+9]) ); - res[3*gamma_co[T][1]+6*(gamma_co[T][1]/2)+6+i] = _mm_sub_pd( res[3*gamma_co[T][1]+6*(gamma_co[T][1]/2)+6+i], buf1 ); - } - } - } - // --------------- - // mu = Z - { - __m128d res_re[6]; - __m128d res_im[6]; - { - __m128d v_re[6]; - __m128d v_im[6]; - int j = 2*12*(*nb_pt); - - for ( int i=0; i<3; i++ ) { - v_re[i] = _mm_setr_pd( *(pr[Z]+j+0+2*i), *(pr[Z]+j+6+2*i) ); - v_im[i] = _mm_setr_pd( *(pr[Z]+j+1+2*i), *(pr[Z]+j+7+2*i) ); - } - for ( int i=3; i<6; i++ ) { - v_re[i] = _mm_setr_pd( *(pr[Z]+j+6+2*i), *(pr[Z]+j+12+2*i) ); - v_im[i] = _mm_setr_pd( *(pr[Z]+j+7+2*i), *(pr[Z]+j+13+2*i) ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf_re = _mm_set1_pd( D_pt[0+6*i] ); - __m128d buf_im = _mm_set1_pd( D_pt[1+6*i] ); - cmul_pd( buf_re, buf_im, v_re[0], v_im[0], &res_re[i], &res_im[i] ); - cmul_pd( buf_re, buf_im, v_re[3], v_im[3], &res_re[i+3], &res_im[i+3] ); - buf_re = _mm_set1_pd( D_pt[2+6*i] ); - buf_im = _mm_set1_pd( D_pt[3+6*i] ); - cfmadd_pd( buf_re, buf_im, v_re[1], v_im[1], &res_re[i], &res_im[i] ); - cfmadd_pd( buf_re, buf_im, v_re[4], v_im[4], &res_re[i+3], &res_im[i+3] ); - buf_re = _mm_set1_pd( D_pt[4+6*i] ); - buf_im = _mm_set1_pd( D_pt[5+6*i] ); - cfmadd_pd( buf_re, buf_im, v_re[2], v_im[2], &res_re[i], 
&res_im[i] ); - cfmadd_pd( buf_re, buf_im, v_re[5], v_im[5], &res_re[i+3], &res_im[i+3] ); - } - D_pt += 18; - nb_pt++; - } - { - __m128d in[12]; - in[0] = _mm_unpacklo_pd( res_re[0], res_im[0] ); - in[1] = _mm_unpacklo_pd( res_re[1], res_im[1] ); - in[2] = _mm_unpacklo_pd( res_re[2], res_im[2] ); - - in[3] = _mm_unpackhi_pd( res_re[0], res_im[0] ); - in[4] = _mm_unpackhi_pd( res_re[1], res_im[1] ); - in[5] = _mm_unpackhi_pd( res_re[2], res_im[2] ); - - in[6] = _mm_unpacklo_pd( res_re[3], res_im[3] ); - in[7] = _mm_unpacklo_pd( res_re[4], res_im[4] ); - in[8] = _mm_unpacklo_pd( res_re[5], res_im[5] ); - - in[9] = _mm_unpackhi_pd( res_re[3], res_im[3] ); - in[10] = _mm_unpackhi_pd( res_re[4], res_im[4] ); - in[11] = _mm_unpackhi_pd( res_re[5], res_im[5] ); - - for ( int i=0; i<12; i++ ) { - res[i] = _mm_sub_pd( res[i], in[i] ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf1 = _mm_mul_pd( gamma0[Z], GAMMA_Z_SHUFFLE(in[i]) ); - res[3*gamma_co[Z][0]+6*(gamma_co[Z][0]/2)+i] = _mm_sub_pd( res[3*gamma_co[Z][0]+6*(gamma_co[Z][0]/2)+i], buf1 ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf1 = _mm_mul_pd( gamma1[Z], GAMMA_Z_SHUFFLE(in[i+3]) ); - res[3*gamma_co[Z][1]+6*(gamma_co[Z][1]/2)+i] = _mm_sub_pd( res[3*gamma_co[Z][1]+6*(gamma_co[Z][1]/2)+i], buf1 ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf1 = _mm_mul_pd( gamma0[Z], GAMMA_Z_SHUFFLE(in[i+6]) ); - res[3*gamma_co[Z][0]+6*(gamma_co[Z][0]/2)+6+i] = _mm_sub_pd( res[3*gamma_co[Z][0]+6*(gamma_co[Z][0]/2)+6+i], buf1 ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf1 = _mm_mul_pd( gamma1[Z], GAMMA_Z_SHUFFLE(in[i+9]) ); - res[3*gamma_co[Z][1]+6*(gamma_co[Z][1]/2)+6+i] = _mm_sub_pd( res[3*gamma_co[Z][1]+6*(gamma_co[Z][1]/2)+6+i], buf1 ); - } - } - } - // --------------- - // mu = Y - { - __m128d res_re[6]; - __m128d res_im[6]; - { - __m128d v_re[6]; - __m128d v_im[6]; - int j = 2*12*(*nb_pt); - - for ( int i=0; i<3; i++ ) { - v_re[i] = _mm_setr_pd( *(pr[Y]+j+0+2*i), *(pr[Y]+j+6+2*i) ); - v_im[i] = _mm_setr_pd( *(pr[Y]+j+1+2*i), *(pr[Y]+j+7+2*i) ); - } - for ( int i=3; i<6; i++ ) { - v_re[i] = _mm_setr_pd( *(pr[Y]+j+6+2*i), *(pr[Y]+j+12+2*i) ); - v_im[i] = _mm_setr_pd( *(pr[Y]+j+7+2*i), *(pr[Y]+j+13+2*i) ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf_re = _mm_set1_pd( D_pt[0+6*i] ); - __m128d buf_im = _mm_set1_pd( D_pt[1+6*i] ); - cmul_pd( buf_re, buf_im, v_re[0], v_im[0], &res_re[i], &res_im[i] ); - cmul_pd( buf_re, buf_im, v_re[3], v_im[3], &res_re[i+3], &res_im[i+3] ); - buf_re = _mm_set1_pd( D_pt[2+6*i] ); - buf_im = _mm_set1_pd( D_pt[3+6*i] ); - cfmadd_pd( buf_re, buf_im, v_re[1], v_im[1], &res_re[i], &res_im[i] ); - cfmadd_pd( buf_re, buf_im, v_re[4], v_im[4], &res_re[i+3], &res_im[i+3] ); - buf_re = _mm_set1_pd( D_pt[4+6*i] ); - buf_im = _mm_set1_pd( D_pt[5+6*i] ); - cfmadd_pd( buf_re, buf_im, v_re[2], v_im[2], &res_re[i], &res_im[i] ); - cfmadd_pd( buf_re, buf_im, v_re[5], v_im[5], &res_re[i+3], &res_im[i+3] ); - } - D_pt += 18; - nb_pt++; - } - { - __m128d in[12]; - in[0] = _mm_unpacklo_pd( res_re[0], res_im[0] ); - in[1] = _mm_unpacklo_pd( res_re[1], res_im[1] ); - in[2] = _mm_unpacklo_pd( res_re[2], res_im[2] ); - - in[3] = _mm_unpackhi_pd( res_re[0], res_im[0] ); - in[4] = _mm_unpackhi_pd( res_re[1], res_im[1] ); - in[5] = _mm_unpackhi_pd( res_re[2], res_im[2] ); - - in[6] = _mm_unpacklo_pd( res_re[3], res_im[3] ); - in[7] = _mm_unpacklo_pd( res_re[4], res_im[4] ); - in[8] = _mm_unpacklo_pd( res_re[5], res_im[5] ); - - in[9] = _mm_unpackhi_pd( res_re[3], res_im[3] ); - in[10] = _mm_unpackhi_pd( res_re[4], res_im[4] ); - in[11] = 
_mm_unpackhi_pd( res_re[5], res_im[5] ); - - for ( int i=0; i<12; i++ ) { - res[i] = _mm_sub_pd( res[i], in[i] ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf1 = _mm_mul_pd( gamma0[Y], GAMMA_Y_SHUFFLE(in[i]) ); - res[3*gamma_co[Y][0]+6*(gamma_co[Y][0]/2)+i] = _mm_sub_pd( res[3*gamma_co[Y][0]+6*(gamma_co[Y][0]/2)+i], buf1 ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf1 = _mm_mul_pd( gamma1[Y], GAMMA_Y_SHUFFLE(in[i+3]) ); - res[3*gamma_co[Y][1]+6*(gamma_co[Y][1]/2)+i] = _mm_sub_pd( res[3*gamma_co[Y][1]+6*(gamma_co[Y][1]/2)+i], buf1 ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf1 = _mm_mul_pd( gamma0[Y], GAMMA_Y_SHUFFLE(in[i+6]) ); - res[3*gamma_co[Y][0]+6*(gamma_co[Y][0]/2)+6+i] = _mm_sub_pd( res[3*gamma_co[Y][0]+6*(gamma_co[Y][0]/2)+6+i], buf1 ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf1 = _mm_mul_pd( gamma1[Y], GAMMA_Y_SHUFFLE(in[i+9]) ); - res[3*gamma_co[Y][1]+6*(gamma_co[Y][1]/2)+6+i] = _mm_sub_pd( res[3*gamma_co[Y][1]+6*(gamma_co[Y][1]/2)+6+i], buf1 ); - } - } - } - // --------------- - // mu = X - { - __m128d res_re[6]; - __m128d res_im[6]; - { - __m128d v_re[6]; - __m128d v_im[6]; - int j = 2*12*(*nb_pt); - - for ( int i=0; i<3; i++ ) { - v_re[i] = _mm_setr_pd( *(pr[X]+j+0+2*i), *(pr[X]+j+6+2*i) ); - v_im[i] = _mm_setr_pd( *(pr[X]+j+1+2*i), *(pr[X]+j+7+2*i) ); - } - for ( int i=3; i<6; i++ ) { - v_re[i] = _mm_setr_pd( *(pr[X]+j+6+2*i), *(pr[X]+j+12+2*i) ); - v_im[i] = _mm_setr_pd( *(pr[X]+j+7+2*i), *(pr[X]+j+13+2*i) ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf_re = _mm_set1_pd( D_pt[0+6*i] ); - __m128d buf_im = _mm_set1_pd( D_pt[1+6*i] ); - cmul_pd( buf_re, buf_im, v_re[0], v_im[0], &res_re[i], &res_im[i] ); - cmul_pd( buf_re, buf_im, v_re[3], v_im[3], &res_re[i+3], &res_im[i+3] ); - buf_re = _mm_set1_pd( D_pt[2+6*i] ); - buf_im = _mm_set1_pd( D_pt[3+6*i] ); - cfmadd_pd( buf_re, buf_im, v_re[1], v_im[1], &res_re[i], &res_im[i] ); - cfmadd_pd( buf_re, buf_im, v_re[4], v_im[4], &res_re[i+3], &res_im[i+3] ); - buf_re = _mm_set1_pd( D_pt[4+6*i] ); - buf_im = _mm_set1_pd( D_pt[5+6*i] ); - cfmadd_pd( buf_re, buf_im, v_re[2], v_im[2], &res_re[i], &res_im[i] ); - cfmadd_pd( buf_re, buf_im, v_re[5], v_im[5], &res_re[i+3], &res_im[i+3] ); - } - D_pt += 18; - nb_pt++; - } - { - __m128d in[12]; - in[0] = _mm_unpacklo_pd( res_re[0], res_im[0] ); - in[1] = _mm_unpacklo_pd( res_re[1], res_im[1] ); - in[2] = _mm_unpacklo_pd( res_re[2], res_im[2] ); - - in[3] = _mm_unpackhi_pd( res_re[0], res_im[0] ); - in[4] = _mm_unpackhi_pd( res_re[1], res_im[1] ); - in[5] = _mm_unpackhi_pd( res_re[2], res_im[2] ); - - in[6] = _mm_unpacklo_pd( res_re[3], res_im[3] ); - in[7] = _mm_unpacklo_pd( res_re[4], res_im[4] ); - in[8] = _mm_unpacklo_pd( res_re[5], res_im[5] ); - - in[9] = _mm_unpackhi_pd( res_re[3], res_im[3] ); - in[10] = _mm_unpackhi_pd( res_re[4], res_im[4] ); - in[11] = _mm_unpackhi_pd( res_re[5], res_im[5] ); - - for ( int i=0; i<12; i++ ) { - res[i] = _mm_sub_pd( res[i], in[i] ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf1 = _mm_mul_pd( gamma0[X], GAMMA_X_SHUFFLE(in[i]) ); - res[3*gamma_co[X][0]+6*(gamma_co[X][0]/2)+i] = _mm_sub_pd( res[3*gamma_co[X][0]+6*(gamma_co[X][0]/2)+i], buf1 ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf1 = _mm_mul_pd( gamma1[X], GAMMA_X_SHUFFLE(in[i+3]) ); - res[3*gamma_co[X][1]+6*(gamma_co[X][1]/2)+i] = _mm_sub_pd( res[3*gamma_co[X][1]+6*(gamma_co[X][1]/2)+i], buf1 ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf1 = _mm_mul_pd( gamma0[X], GAMMA_X_SHUFFLE(in[i+6]) ); - res[3*gamma_co[X][0]+6*(gamma_co[X][0]/2)+6+i] = _mm_sub_pd( 
res[3*gamma_co[X][0]+6*(gamma_co[X][0]/2)+6+i], buf1 ); - } - for ( int i=0; i<3; i++ ) { - __m128d buf1 = _mm_mul_pd( gamma1[X], GAMMA_X_SHUFFLE(in[i+9]) ); - res[3*gamma_co[X][1]+6*(gamma_co[X][1]/2)+6+i] = _mm_sub_pd( res[3*gamma_co[X][1]+6*(gamma_co[X][1]/2)+6+i], buf1 ); - } - } - } - // --------------- - - for ( int i=0; i<24; i++ ) { - _mm_storeu_pd( eta_pt + 2*i, res[i] ); - } - eta_pt+=48; - } - -} - - -void su3_dpbp_float( complex_float* eta, complex_float *prn[4], operator_float_struct *op, - int *neighbor, int start, int end ) { - - float *D_pt = (float*)(op->D_vectorized+2*(start/24*48)); - float *eta_pt = (float*)(eta+start); - float *eta_end_pt = (float*)(eta+end); - float *pr[4] = {(float*)(prn[0]),(float*)(prn[1]),(float*)(prn[2]),(float*)(prn[3])}; - int *nb_pt = neighbor+((start/24)*4); - - __m128 gamma0[4][2]; - __m128 gamma1[4][2]; - - for ( int mu=0; mu<4; mu++ ) { - gamma0[mu][0] = _mm_set1_ps( -gamma_re_sign[mu][gamma_co[mu][0]] ); - gamma0[mu][1] = _mm_set1_ps( -gamma_im_sign[mu][gamma_co[mu][0]] ); - gamma1[mu][0] = _mm_set1_ps( -gamma_re_sign[mu][gamma_co[mu][1]] ); - gamma1[mu][1] = _mm_set1_ps( -gamma_im_sign[mu][gamma_co[mu][1]] ); - } - - while( eta_pt < eta_end_pt ) { - - __m128 eta_lo1 = _mm_loadu_ps( eta_pt ); - __m128 eta_lo2 = _mm_loadu_ps( eta_pt + 4 ); - __m128 eta_hi1 = _mm_loadu_ps( eta_pt + 6 ); - __m128 eta_hi2 = _mm_loadu_ps( eta_pt + 10 ); - __m128 eta_lo3 = _mm_loadu_ps( eta_pt + 12 ); - __m128 eta_lo4 = _mm_loadu_ps( eta_pt + 16 ); - __m128 eta_hi3 = _mm_loadu_ps( eta_pt + 18 ); - __m128 eta_hi4 = _mm_loadu_ps( eta_pt + 22 ); - - __m128 eta2_lo[4]; - __m128 eta2_hi[4]; - - eta2_lo[0] = _mm_loadu_ps( eta_pt + 24 ); - eta2_hi[0] = _mm_loadu_ps( eta_pt + 26 ); - eta2_lo[1] = _mm_loadu_ps( eta_pt + 30 ); - eta2_hi[1] = _mm_loadu_ps( eta_pt + 32 ); - eta2_lo[2] = _mm_loadu_ps( eta_pt + 36 ); - eta2_hi[2] = _mm_loadu_ps( eta_pt + 38 ); - eta2_lo[3] = _mm_loadu_ps( eta_pt + 42 ); - eta2_hi[3] = _mm_loadu_ps( eta_pt + 44 ); - - for ( int mu=0; mu<4; mu++ ) { - __m128 res1[4]; - __m128 res2[4]; - - { - int j = 2*12*(*nb_pt); - // load 1st part of su(3) matrix and multiply - { - __m128 buf1 = _mm_loadu_ps( D_pt ); - __m128 buf2 = _mm_loadu_ps( D_pt+SIMD_LENGTH_float ); - { - __m128 buf3 = _mm_set1_ps( *(pr[mu]+j+0) ); - __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+1) ); - cmul( buf1, buf2, buf3, buf4, &res1[0], &res1[1] ); - } - { - __m128 buf3 = _mm_set1_ps( *(pr[mu]+j+6) ); - __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+7) ); - cmul( buf1, buf2, buf3, buf4, &res2[0], &res2[1] ); - } - { - __m128 buf3 = _mm_set1_ps( *(pr[mu]+j+12) ); - __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+13) ); - cmul( buf1, buf2, buf3, buf4, &res1[2], &res1[3] ); - } - { - __m128 buf3 = _mm_set1_ps( *(pr[mu]+j+18) ); - __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+19) ); - cmul( buf1, buf2, buf3, buf4, &res2[2], &res2[3] ); - } - } - // load 2nd part of su(3) matrix and multiply - { - __m128 buf1 = _mm_loadu_ps( D_pt+2*SIMD_LENGTH_float ); - __m128 buf2 = _mm_loadu_ps( D_pt+3*SIMD_LENGTH_float ); - { - __m128 buf3 = _mm_set1_ps( *(pr[mu]+j+2) ); - __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+3) ); - cfmadd( buf1, buf2, buf3, buf4, &res1[0], &res1[1] ); - } - { - __m128 buf3 = _mm_set1_ps( *(pr[mu]+j+8) ); - __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+9) ); - cfmadd( buf1, buf2, buf3, buf4, &res2[0], &res2[1] ); - } - { - __m128 buf3 = _mm_set1_ps( *(pr[mu]+j+14) ); - __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+15) ); - cfmadd( buf1, buf2, buf3, buf4, &res1[2], &res1[3] ); - } - { - __m128 buf3 = _mm_set1_ps( *(pr[mu]+j+20) 
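In the double-precision kernels above, one SU(3) link matrix is loaded as 18 reals with real and imaginary parts interleaved, six reals per row (the D_pt[0+6*i], D_pt[1+6*i], ... accesses). A plain scalar reference for the same matrix-vector product, assuming that layout:

#include <complex.h>

/* sketch: y = U*x for a 3x3 complex matrix stored as 18 doubles, row-major, re/im interleaved */
static void su3_matvec_ref( const double *U, const double complex x[3], double complex y[3] ) {
  for ( int i=0; i<3; i++ ) {
    y[i] = 0.0;
    for ( int j=0; j<3; j++ )
      y[i] += ( U[6*i+2*j] + I*U[6*i+2*j+1] ) * x[j];
  }
}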
); - __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+21) ); - cfmadd( buf1, buf2, buf3, buf4, &res2[2], &res2[3] ); - } - } - // load 3rd part of su(3) matrix and multiply - { - __m128 buf1 = _mm_loadu_ps( D_pt+4*SIMD_LENGTH_float ); - __m128 buf2 = _mm_loadu_ps( D_pt+5*SIMD_LENGTH_float ); - { - __m128 buf3 = _mm_set1_ps( *(pr[mu]+j+4) ); - __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+5) ); - cfmadd( buf1, buf2, buf3, buf4, &res1[0], &res1[1] ); - } - { - __m128 buf3 = _mm_set1_ps( *(pr[mu]+j+10) ); - __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+11) ); - cfmadd( buf1, buf2, buf3, buf4, &res2[0], &res2[1] ); - } - { - __m128 buf3 = _mm_set1_ps( *(pr[mu]+j+16) ); - __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+17) ); - cfmadd( buf1, buf2, buf3, buf4, &res1[2], &res1[3] ); - } - { - __m128 buf3 = _mm_set1_ps( *(pr[mu]+j+22) ); - __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+23) ); - cfmadd( buf1, buf2, buf3, buf4, &res2[2], &res2[3] ); - } - } - } - - { - // store spin0 contribution - { - __m128 buf1 = _mm_unpacklo_ps( res1[0], res1[1] ); - __m128 buf2 = _mm_unpackhi_ps( res1[0], res1[1] ); - eta_lo1 = _mm_sub_ps( eta_lo1, buf1 ); - eta_lo2 = _mm_sub_ps( eta_lo2, buf2 ); - } - { - __m128 buf1 = _mm_unpacklo_ps( res1[2], res1[3] ); - __m128 buf2 = _mm_unpackhi_ps( res1[2], res1[3] ); - eta_lo3 = _mm_sub_ps( eta_lo3, buf1 ); - eta_lo4 = _mm_sub_ps( eta_lo4, buf2 ); - } - - // store contribution from 1st SU(3) multiplication to either spin2 or spin3 - { - __m128 buf1 = _mm_mul_ps( gamma0[mu][0], res1[gamma_offset[mu][0]] ); - __m128 buf2 = _mm_mul_ps( gamma0[mu][1], res1[1-gamma_offset[mu][0]] ); - __m128 buf3 = _mm_unpacklo_ps( buf1, buf2 ); - __m128 buf4 = _mm_unpackhi_ps( buf1, buf2 ); - buf4 = _mm_shuffle_ps( buf3, buf4, _MM_SHUFFLE(1,0,3,2) ); // hi(buf3), lo(buf4) - eta2_lo[gamma_co[mu][2]] = _mm_sub_ps( eta2_lo[gamma_co[mu][2]], buf3 ); - eta2_hi[gamma_co[mu][2]] = _mm_sub_ps( eta2_hi[gamma_co[mu][2]], buf4 ); - } - { - __m128 buf1 = _mm_mul_ps( gamma0[mu][0], res1[2+gamma_offset[mu][0]] ); - __m128 buf2 = _mm_mul_ps( gamma0[mu][1], res1[3-gamma_offset[mu][0]] ); - __m128 buf3 = _mm_unpacklo_ps( buf1, buf2 ); - __m128 buf4 = _mm_unpackhi_ps( buf1, buf2 ); - buf4 = _mm_shuffle_ps( buf3, buf4, _MM_SHUFFLE(1,0,3,2) ); // hi(buf3), lo(buf4) - eta2_lo[2+gamma_co[mu][2]] = _mm_sub_ps( eta2_lo[2+gamma_co[mu][2]], buf3 ); - eta2_hi[2+gamma_co[mu][2]] = _mm_sub_ps( eta2_hi[2+gamma_co[mu][2]], buf4 ); - } - } - { - // store spin1 contribution - { - __m128 buf1 = _mm_unpacklo_ps( res2[0], res2[1] ); - __m128 buf2 = _mm_unpackhi_ps( res2[0], res2[1] ); - eta_hi1 = _mm_sub_ps( eta_hi1, buf1 ); - eta_hi2 = _mm_sub_ps( eta_hi2, buf2 ); - } - { - __m128 buf1 = _mm_unpacklo_ps( res2[2], res2[3] ); - __m128 buf2 = _mm_unpackhi_ps( res2[2], res2[3] ); - eta_hi3 = _mm_sub_ps( eta_hi3, buf1 ); - eta_hi4 = _mm_sub_ps( eta_hi4, buf2 ); - } - - // store contribution from 1st SU(3) multiplication to either spin2 or spin3 - { - __m128 buf1 = _mm_mul_ps( gamma1[mu][0], res2[gamma_offset[mu][1]] ); - __m128 buf2 = _mm_mul_ps( gamma1[mu][1], res2[1-gamma_offset[mu][1]] ); - __m128 buf3 = _mm_unpacklo_ps( buf1, buf2 ); - __m128 buf4 = _mm_unpackhi_ps( buf1, buf2 ); - buf4 = _mm_shuffle_ps( buf3, buf4, _MM_SHUFFLE(1,0,3,2) ); // hi(buf3), lo(buf4) - eta2_lo[gamma_co[mu][3]] = _mm_sub_ps( eta2_lo[gamma_co[mu][3]], buf3 ); - eta2_hi[gamma_co[mu][3]] = _mm_sub_ps( eta2_hi[gamma_co[mu][3]], buf4 ); - } - { - __m128 buf1 = _mm_mul_ps( gamma1[mu][0], res2[2+gamma_offset[mu][1]] ); - __m128 buf2 = _mm_mul_ps( gamma1[mu][1], res2[3-gamma_offset[mu][1]] ); - __m128 
buf3 = _mm_unpacklo_ps( buf1, buf2 ); - __m128 buf4 = _mm_unpackhi_ps( buf1, buf2 ); - buf4 = _mm_shuffle_ps( buf3, buf4, _MM_SHUFFLE(1,0,3,2) ); // hi(buf3), lo(buf4) - eta2_lo[2+gamma_co[mu][3]] = _mm_sub_ps( eta2_lo[2+gamma_co[mu][3]], buf3 ); - eta2_hi[2+gamma_co[mu][3]] = _mm_sub_ps( eta2_hi[2+gamma_co[mu][3]], buf4 ); - } - } - nb_pt++; - D_pt += 24; - } - - _mm_storeu_ps( eta_pt, eta_lo1 ); - _mm_storeu_ps( eta_pt+4, eta_lo2 ); - _mm_storeu_ps( eta_pt+6, eta_hi1 ); - _mm_storeu_ps( eta_pt+10, eta_hi2 ); - _mm_storeu_ps( eta_pt+12, eta_lo3 ); - _mm_storeu_ps( eta_pt+16, eta_lo4 ); - _mm_storeu_ps( eta_pt+18, eta_hi3 ); - _mm_storeu_ps( eta_pt+22, eta_hi4 ); - _mm_storeu_ps( eta_pt+24, eta2_lo[0] ); - _mm_storeu_ps( eta_pt+26, eta2_hi[0] ); - _mm_storeu_ps( eta_pt+30, eta2_lo[1] ); - _mm_storeu_ps( eta_pt+32, eta2_hi[1] ); - _mm_storeu_ps( eta_pt+36, eta2_lo[2] ); - _mm_storeu_ps( eta_pt+38, eta2_hi[2] ); - _mm_storeu_ps( eta_pt+42, eta2_lo[3] ); - _mm_storeu_ps( eta_pt+44, eta2_hi[3] ); - - eta_pt += 48; - } - -} - - -void block_oddeven_plus_coupling_double( double *eta, double *D, double *phi, int mu, int start, int end, int *ind, int *neighbor ) { } -void block_oddeven_pT_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_sub_ps -#define MU 0 -#include neighbor_coupling_file -#undef MU -#undef UPD -} -void block_oddeven_pZ_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_sub_ps -#define MU 1 -#include neighbor_coupling_file -#undef MU -#undef UPD -} -void block_oddeven_pY_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_sub_ps -#define MU 2 -#include neighbor_coupling_file -#undef MU -#undef UPD -} -void block_oddeven_pX_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_sub_ps -#define MU 3 -#include neighbor_coupling_file -#undef MU -#undef UPD -} -void block_oddeven_plus_coupling_float( float *eta, float *D, float *phi, int mu, int start, int end, int *ind, int *neighbor ) { - switch ( mu ) { - case T: block_oddeven_pT_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - case Z: block_oddeven_pZ_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - case Y: block_oddeven_pY_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - case X: block_oddeven_pX_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - default: error0("block_oddeven_plus_coupling_float: invalid mu=%d\n", mu ); - } -} - - -void block_oddeven_nplus_coupling_double( double *eta, double *D, double *phi, int mu, int start, int end, int *ind, int *neighbor ) { } -void block_oddeven_npT_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_add_ps -#define MU 0 -#include neighbor_coupling_file -#undef MU -#undef UPD -} -void block_oddeven_npZ_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_add_ps -#define MU 1 -#include neighbor_coupling_file -#undef MU -#undef UPD -} -void block_oddeven_npY_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_add_ps -#define MU 2 -#include neighbor_coupling_file -#undef MU -#undef UPD -} -void block_oddeven_npX_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, 
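Each per-direction wrapper above is generated from the single body in neighbor_coupling_file by re-including it with different UPD / MU (and, further down, MINUSDIR / BOUNDARY) settings, while a thin switch dispatches on mu at run time. A compressed sketch of that specialization pattern with hypothetical names, inlining the shared body as a macro instead of an #include:

/* DEFINE_COUPLING, couple_p*, and couple_plus are illustrative only; they show the
 * compile-time specialization scheme, not the actual stencil body. */
#define DEFINE_COUPLING( NAME, MU, UPD )                               \
  static void NAME( float *eta, const float *phi, int n ) {           \
    for ( int i=0; i<n; i++ ) eta[i] = UPD( eta[i], phi[4*i+(MU)] );   \
  }

#define SUB(a,b) ((a)-(b))
#define ADD(a,b) ((a)+(b))

DEFINE_COUPLING( couple_pT,  0, SUB )
DEFINE_COUPLING( couple_pZ,  1, SUB )
DEFINE_COUPLING( couple_npT, 0, ADD )

static void couple_plus( float *eta, const float *phi, int n, int mu ) {
  switch ( mu ) {
    case 0: couple_pT( eta, phi, n ); break;
    case 1: couple_pZ( eta, phi, n ); break;
    default: break; /* remaining directions analogous */
  }
}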
int *neighbor ) { -#define UPD _mm_add_ps -#define MU 3 -#include neighbor_coupling_file -#undef MU -#undef UPD -} -void block_oddeven_nplus_coupling_float( float *eta, float *D, float *phi, int mu, int start, int end, int *ind, int *neighbor ) { - switch ( mu ) { - case T: block_oddeven_npT_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - case Z: block_oddeven_npZ_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - case Y: block_oddeven_npY_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - case X: block_oddeven_npX_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - default: error0("block_oddeven_nplus_coupling_float: invalid mu=%d\n", mu ); - } -} - - -void block_oddeven_minus_coupling_double( double *eta, double *D, double *phi, int mu, int start, int end, int *ind, int *neighbor ) { } -void block_oddeven_mT_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_sub_ps -#define MU 0 -#define MINUSDIR -#include neighbor_coupling_file -#undef MINUSDIR -#undef MU -#undef UPD -} -void block_oddeven_mZ_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_sub_ps -#define MU 1 -#define MINUSDIR -#include neighbor_coupling_file -#undef MINUSDIR -#undef MU -#undef UPD -} -void block_oddeven_mY_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_sub_ps -#define MU 2 -#define MINUSDIR -#include neighbor_coupling_file -#undef MINUSDIR -#undef MU -#undef UPD -} -void block_oddeven_mX_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_sub_ps -#define MU 3 -#define MINUSDIR -#include neighbor_coupling_file -#undef MINUSDIR -#undef MU -#undef UPD -} -void block_oddeven_minus_coupling_float( float *eta, float *D, float *phi, int mu, int start, int end, int *ind, int *neighbor ) { - switch ( mu ) { - case T: block_oddeven_mT_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - case Z: block_oddeven_mZ_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - case Y: block_oddeven_mY_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - case X: block_oddeven_mX_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - default: error0("block_oddeven_minus_coupling_float: invalid mu=%d\n", mu ); - } -} - - -void block_oddeven_nminus_coupling_double( double *eta, double *D, double *phi, int mu, int start, int end, int *ind, int *neighbor ) { } -void block_oddeven_nmT_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_add_ps -#define MU 0 -#define MINUSDIR -#include neighbor_coupling_file -#undef MINUSDIR -#undef MU -#undef UPD -} -void block_oddeven_nmZ_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_add_ps -#define MU 1 -#define MINUSDIR -#include neighbor_coupling_file -#undef MINUSDIR -#undef MU -#undef UPD -} -void block_oddeven_nmY_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_add_ps -#define MU 2 -#define MINUSDIR -#include neighbor_coupling_file -#undef MINUSDIR -#undef MU -#undef UPD -} -void block_oddeven_nmX_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_add_ps -#define MU 3 -#define 
MINUSDIR -#include neighbor_coupling_file -#undef MINUSDIR -#undef MU -#undef UPD -} -void block_oddeven_nminus_coupling_float( float *eta, float *D, float *phi, int mu, int start, int end, int *ind, int *neighbor ) { - switch ( mu ) { - case T: block_oddeven_nmT_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - case Z: block_oddeven_nmZ_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - case Y: block_oddeven_nmY_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - case X: block_oddeven_nmX_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - default: error0("block_oddeven_nminus_coupling_float: invalid mu=%d\n", mu ); - } -} - - -void boundary_nminus_coupling_double( double *eta, double *D, double *phi, int mu, int start, int end, int *ind, int *neighbor ) { } -void boundary_nmT_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_add_ps -#define MU 0 -#define BOUNDARY -#define MINUSDIR -#include neighbor_coupling_file -#undef MINUSDIR -#undef BOUNDARY -#undef MU -#undef UPD -} -void boundary_nmZ_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_add_ps -#define MU 1 -#define BOUNDARY -#define MINUSDIR -#include neighbor_coupling_file -#undef MINUSDIR -#undef BOUNDARY -#undef MU -#undef UPD -} -void boundary_nmY_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_add_ps -#define MU 2 -#define BOUNDARY -#define MINUSDIR -#include neighbor_coupling_file -#undef MINUSDIR -#undef BOUNDARY -#undef MU -#undef UPD -} -void boundary_nmX_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_add_ps -#define MU 3 -#define BOUNDARY -#define MINUSDIR -#include neighbor_coupling_file -#undef MINUSDIR -#undef BOUNDARY -#undef MU -#undef UPD -} -void boundary_nminus_coupling_float( float *eta, float *D, float *phi, int mu, int start, int end, int *ind, int *neighbor ) { - switch ( mu ) { - case T: boundary_nmT_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - case Z: boundary_nmZ_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - case Y: boundary_nmY_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - case X: boundary_nmX_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - default: error0("boundary_nminus_coupling_float: invalid mu=%d\n", mu ); - } -} - - -void boundary_nplus_coupling_double( double *eta, double *D, double *phi, int mu, int start, int end, int *ind, int *neighbor ) { } -void boundary_npT_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_add_ps -#define MU 0 -#define BOUNDARY -#include neighbor_coupling_file -#undef BOUNDARY -#undef MU -#undef UPD -} -void boundary_npZ_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_add_ps -#define MU 1 -#define BOUNDARY -#include neighbor_coupling_file -#undef BOUNDARY -#undef MU -#undef UPD -} -void boundary_npY_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_add_ps -#define MU 2 -#define BOUNDARY -#include neighbor_coupling_file -#undef BOUNDARY -#undef MU -#undef UPD -} -void boundary_npX_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { 
-#define UPD _mm_add_ps -#define MU 3 -#define BOUNDARY -#include neighbor_coupling_file -#undef BOUNDARY -#undef MU -#undef UPD -} -void boundary_nplus_coupling_float( float *eta, float *D, float *phi, int mu, int start, int end, int *ind, int *neighbor ) { - switch ( mu ) { - case T: boundary_npT_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - case Z: boundary_npZ_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - case Y: boundary_npY_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - case X: boundary_npX_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - default: error0("boundary_nplus_coupling_float: invalid mu=%d\n", mu ); - } -} - - -void boundary_minus_coupling_double( double *eta, double *D, double *phi, int mu, int start, int end, int *ind, int *neighbor ) { } -void boundary_mT_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_sub_ps -#define MU 0 -#define BOUNDARY -#define MINUSDIR -#include neighbor_coupling_file -#undef MINUSDIR -#undef BOUNDARY -#undef MU -#undef UPD -} -void boundary_mZ_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_sub_ps -#define MU 1 -#define BOUNDARY -#define MINUSDIR -#include neighbor_coupling_file -#undef MINUSDIR -#undef BOUNDARY -#undef MU -#undef UPD -} -void boundary_mY_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_sub_ps -#define MU 2 -#define BOUNDARY -#define MINUSDIR -#include neighbor_coupling_file -#undef MINUSDIR -#undef BOUNDARY -#undef MU -#undef UPD -} -void boundary_mX_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_sub_ps -#define MU 3 -#define BOUNDARY -#define MINUSDIR -#include neighbor_coupling_file -#undef MINUSDIR -#undef BOUNDARY -#undef MU -#undef UPD -} -void boundary_minus_coupling_float( float *eta, float *D, float *phi, int mu, int start, int end, int *ind, int *neighbor ) { - switch ( mu ) { - case T: boundary_mT_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - case Z: boundary_mZ_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - case Y: boundary_mY_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - case X: boundary_mX_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - default: error0("boundary_minus_coupling_float: invalid mu=%d\n", mu ); - } -} - - -void boundary_plus_coupling_double( double *eta, double *D, double *phi, int mu, int start, int end, int *ind, int *neighbor ) { } -void boundary_pT_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_sub_ps -#define MU 0 -#define BOUNDARY -#include neighbor_coupling_file -#undef BOUNDARY -#undef MU -#undef UPD -} -void boundary_pZ_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_sub_ps -#define MU 1 -#define BOUNDARY -#include neighbor_coupling_file -#undef BOUNDARY -#undef MU -#undef UPD -} -void boundary_pY_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { -#define UPD _mm_sub_ps -#define MU 2 -#define BOUNDARY -#include neighbor_coupling_file -#undef BOUNDARY -#undef MU -#undef UPD -} -void boundary_pX_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { 
-#define UPD _mm_sub_ps -#define MU 3 -#define BOUNDARY -#include neighbor_coupling_file -#undef BOUNDARY -#undef MU -#undef UPD -} -void boundary_plus_coupling_float( float *eta, float *D, float *phi, int mu, int start, int end, int *ind, int *neighbor ) { - switch ( mu ) { - case T: boundary_pT_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - case Z: boundary_pZ_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - case Y: boundary_pY_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - case X: boundary_pX_coupling_float( eta, D, phi, start, end, ind, neighbor ); break; - default: error0("boundary_plus_coupling_float: invalid mu=%d\n", mu ); - } -} - - - - - -static inline int sse_clover_real_index( int i, int j ) { - return (i/SIMD_LENGTH_float)*12*SIMD_LENGTH_float + SIMD_LENGTH_float*j*2 + i%SIMD_LENGTH_float; -} - -static inline int sse_clover_imag_index( int i, int j ) { - return (i/SIMD_LENGTH_float)*12*SIMD_LENGTH_float + SIMD_LENGTH_float*(j*2+1) + i%SIMD_LENGTH_float; -} - -void sse_set_clover_double( double *out, complex_double *in ) { } - -void sse_set_clover_float( float *out, complex_float *in ) { - - int index; - float sign = 0.0; - for ( int k=0; k<12; k+=SIMD_LENGTH_float ) { - for ( int j=0; j<6; j++ ) { - for ( int i=0; i i+k ) { - // upper triangle - index = 12 + ( 30 - (5-(k+i))*(6-(k+i)) )/2 + (j-(i+k+1)); - sign = 1.0; - } else { - // lower triangle, j < i+k - index = 12 + ( 30 - (5-(j))*(6-(j)) )/2 + ((i+k)-(j+1)); - sign = -1.0; - } - } else { - // i+k >= 6 - // second 6-by-6 matrix - if ( j > i+k-6 ) { - // upper triangle - index = 12 + 15 + ( 30 - (5-(k+i-6))*(6-(k+i-6)) )/2 + (j-(i+k-6+1)); - sign = 1.0; - } else { - // j < i+k-6 - // lower triangle - index = 12 + 15 + ( 30 - (5-(j))*(6-(j)) )/2 + ((i+k-6)-(j+1)); - sign = -1.0; - } - } - out[ sse_clover_real_index(i+k,j) ] = creal_float( (complex_float)in[index] ); - out[ sse_clover_imag_index(i+k,j) ] = sign*cimag_float( (complex_float)in[index] ); - } - } - } -} - -void sse_set_clover_doublet_double( double *out, complex_double *in ) { } - -void sse_set_clover_doublet_float( float *out, complex_float *in ) { - - int index, d; - float sign = 0.0; - for ( int k=0; k<12; k+=SIMD_LENGTH_float ) { - for ( int j=0; j<6; j++ ) { - for ( int i=0; i i+k ) { - // upper triangle - index = 12 + ( 30 - (5-(k+i))*(6-(k+i)) )/2 + (j-(i+k+1)); - sign = 1.0; - } else { - // lower triangle, j < i+k - index = 12 + ( 30 - (5-(j))*(6-(j)) )/2 + ((i+k)-(j+1)); - sign = -1.0; - } - } else { - // i+k >= 6 - // second 6-by-6 matrix - if ( j > i+k-6 ) { - // upper triangle - index = 12 + 15 + ( 30 - (5-(k+i-6))*(6-(k+i-6)) )/2 + (j-(i+k-6+1)); - sign = 1.0; - } else { - // j < i+k-6 - // lower triangle - index = 12 + 15 + ( 30 - (5-(j))*(6-(j)) )/2 + ((i+k-6)-(j+1)); - sign = -1.0; - } - } - d=(i+k<6)?0:6; - out[ sse_clover_real_index(i+k+d,j) ] = creal_float( in[index] ); - out[ sse_clover_imag_index(i+k+d,j) ] = sign*cimag_float( in[index] ); - out[ sse_clover_real_index(i+k+d+6,j) ] = creal_float( in[index] ); - out[ sse_clover_imag_index(i+k+d+6,j) ] = sign*cimag_float( in[index] ); - } - } - } -} - -void sse_add_diagonal_clover_double( double *out, complex_double *diag ) { } - -void sse_add_diagonal_clover_float( float *out, complex_float *diag ) { - for ( int k=0; k<12; k++ ) { - out[ sse_clover_real_index(k,k%6) ] += creal_float( diag[k] ); - out[ sse_clover_imag_index(k,k%6) ] += cimag_float( diag[k] ); - } -} - -void sse_add_diagonal_clover_doublet_double( double *out, 
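sse_set_clover_float above unpacks one site clover term stored as two Hermitian 6x6 blocks: judging by the offsets 12 and 12+15, the first 12 input entries hold the diagonals, and each block's strictly upper triangle follows as 15 packed entries addressed by ( 30 - (5-r)*(6-r) )/2 + (c-(r+1)); the lower triangle is reconstructed from the same entries with the imaginary part negated (the sign = -1.0 branch), i.e. by Hermitian symmetry. A small sketch of the equivalent packed-triangle index:

/* sketch: packed index of the strictly-upper-triangular entry (r,c), 0 <= r < c < 6,
 * of a 6x6 block stored row by row; algebraically identical to the expression above */
static inline int upper_tri6_index( int r, int c ) {
  return r*(11-r)/2 + (c - r - 1);
}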
complex_double *diag ) { } - -void sse_add_diagonal_clover_doublet_float( float *out, complex_float *diag ) { - for ( int k=0; k<6; k++ ) { - out[ sse_clover_real_index(k,k%6) ] += creal_float( diag[k] ); - out[ sse_clover_imag_index(k,k%6) ] += cimag_float( diag[k] ); - out[ sse_clover_real_index(k+6,k%6) ] -= creal_float( diag[k] ); - out[ sse_clover_imag_index(k+6,k%6) ] -= cimag_float( diag[k] ); - } - for ( int k=6; k<12; k++ ) { - out[ sse_clover_real_index(k+6,k%6) ] += creal_float( diag[k] ); - out[ sse_clover_imag_index(k+6,k%6) ] += cimag_float( diag[k] ); - out[ sse_clover_real_index(k+12,k%6) ] -= creal_float( diag[k] ); - out[ sse_clover_imag_index(k+12,k%6) ] -= cimag_float( diag[k] ); - } -} - -void sse_site_clover_double( double *eta, const double *phi, const double *clover ) { - -} - -void sse_site_clover_float( float *eta, const float *phi, float *clover ) { - - __m128 in_re; - __m128 in_im; - - __m128 clov_re; - __m128 clov_im; - - __m128 out_re; - __m128 out_im; - -#ifdef HAVE_TM1p1 - if( g.n_flavours == 2 ) { - // lines 1--4; indeces from 0 to 47 - in_re = _mm_set1_ps( phi[0] ); - in_im = _mm_set1_ps( phi[1] ); - clov_re = _mm_load_ps( clover ); - clov_im = _mm_load_ps( clover+SIMD_LENGTH_float ); - cmul( clov_re, clov_im, in_re, in_im, &out_re, &out_im ); - clover+=2*SIMD_LENGTH_float; - - for ( int i=1; i<6; i++ ) { - in_re = _mm_set1_ps( phi[2*i] ); - in_im = _mm_set1_ps( phi[2*i+1] ); - clov_re = _mm_load_ps( clover ); - clov_im = _mm_load_ps( clover+SIMD_LENGTH_float ); - cfmadd( clov_re, clov_im, in_re, in_im, &out_re, &out_im ); - clover+=2*SIMD_LENGTH_float; - } - - sse_complex_interleaved_store( out_re, out_im, eta ); - - // lines 5--8; indeces from 48 to 95 - in_re = _mm_setr_ps( phi[0], phi[0], phi[12], phi[12] ); - in_im = _mm_setr_ps( phi[1], phi[1], phi[13], phi[13] ); - clov_re = _mm_load_ps( clover ); - clov_im = _mm_load_ps( clover+SIMD_LENGTH_float ); - cmul( clov_re, clov_im, in_re, in_im, &out_re, &out_im ); - clover+=2*SIMD_LENGTH_float; - - for ( int i=1; i<6; i++ ) { - in_re = _mm_setr_ps( phi[2*i], phi[2*i], phi[2*i+12], phi[2*i+12] ); - in_im = _mm_setr_ps( phi[2*i+1], phi[2*i+1], phi[2*i+13], phi[2*i+13] ); - clov_re = _mm_load_ps( clover ); - clov_im = _mm_load_ps( clover+SIMD_LENGTH_float ); - cfmadd( clov_re, clov_im, in_re, in_im, &out_re, &out_im ); - clover+=2*SIMD_LENGTH_float; - } - - sse_complex_interleaved_store( out_re, out_im, eta+8 ); - - // lines 9--12; indeces from 96 to 143 - in_re = _mm_set1_ps( phi[12] ); - in_im = _mm_set1_ps( phi[13] ); - clov_re = _mm_load_ps( clover ); - clov_im = _mm_load_ps( clover+SIMD_LENGTH_float ); - cmul( clov_re, clov_im, in_re, in_im, &out_re, &out_im ); - clover+=2*SIMD_LENGTH_float; - - for ( int i=1; i<6; i++ ) { - in_re = _mm_set1_ps( phi[2*i+12] ); - in_im = _mm_set1_ps( phi[2*i+13] ); - clov_re = _mm_load_ps( clover ); - clov_im = _mm_load_ps( clover+SIMD_LENGTH_float ); - cfmadd( clov_re, clov_im, in_re, in_im, &out_re, &out_im ); - clover+=2*SIMD_LENGTH_float; - } - - sse_complex_interleaved_store( out_re, out_im, eta+16 ); - - // lines 13--16; indeces from 144 to 191 - in_re = _mm_set1_ps( phi[24] ); - in_im = _mm_set1_ps( phi[25] ); - clov_re = _mm_load_ps( clover ); - clov_im = _mm_load_ps( clover+SIMD_LENGTH_float ); - cmul( clov_re, clov_im, in_re, in_im, &out_re, &out_im ); - clover+=2*SIMD_LENGTH_float; - - for ( int i=1; i<6; i++ ) { - in_re = _mm_set1_ps( phi[2*i+24] ); - in_im = _mm_set1_ps( phi[2*i+25] ); - clov_re = _mm_load_ps( clover ); - clov_im = _mm_load_ps( 
clover+SIMD_LENGTH_float ); - cfmadd( clov_re, clov_im, in_re, in_im, &out_re, &out_im ); - clover+=2*SIMD_LENGTH_float; - } - - sse_complex_interleaved_store( out_re, out_im, eta+24 ); - - // lines 17--20; indeces from 192 to 239 - in_re = _mm_setr_ps( phi[24], phi[24], phi[36], phi[36] ); - in_im = _mm_setr_ps( phi[25], phi[25], phi[37], phi[37] ); - clov_re = _mm_load_ps( clover ); - clov_im = _mm_load_ps( clover+SIMD_LENGTH_float ); - cmul( clov_re, clov_im, in_re, in_im, &out_re, &out_im ); - clover+=2*SIMD_LENGTH_float; - - for ( int i=1; i<6; i++ ) { - in_re = _mm_setr_ps( phi[2*i+24], phi[2*i+24], phi[2*i+36], phi[2*i+36] ); - in_im = _mm_setr_ps( phi[2*i+25], phi[2*i+25], phi[2*i+37], phi[2*i+37] ); - clov_re = _mm_load_ps( clover ); - clov_im = _mm_load_ps( clover+SIMD_LENGTH_float ); - cfmadd( clov_re, clov_im, in_re, in_im, &out_re, &out_im ); - clover+=2*SIMD_LENGTH_float; - } - - sse_complex_interleaved_store( out_re, out_im, eta+32 ); - - // lines 21--24; indeces from 240 to 287 - in_re = _mm_set1_ps( phi[36] ); - in_im = _mm_set1_ps( phi[37] ); - clov_re = _mm_load_ps( clover ); - clov_im = _mm_load_ps( clover+SIMD_LENGTH_float ); - cmul( clov_re, clov_im, in_re, in_im, &out_re, &out_im ); - clover+=2*SIMD_LENGTH_float; - - for ( int i=1; i<6; i++ ) { - in_re = _mm_set1_ps( phi[2*i+36] ); - in_im = _mm_set1_ps( phi[2*i+37] ); - clov_re = _mm_load_ps( clover ); - clov_im = _mm_load_ps( clover+SIMD_LENGTH_float ); - cfmadd( clov_re, clov_im, in_re, in_im, &out_re, &out_im ); - clover+=2*SIMD_LENGTH_float; - } - - sse_complex_interleaved_store( out_re, out_im, eta+40 ); - - } else { -#endif - // lines 1--4; indeces from 0 to 47 - in_re = _mm_set1_ps( phi[0] ); - in_im = _mm_set1_ps( phi[1] ); - clov_re = _mm_load_ps( clover ); - clov_im = _mm_load_ps( clover+SIMD_LENGTH_float ); - cmul( clov_re, clov_im, in_re, in_im, &out_re, &out_im ); - clover+=2*SIMD_LENGTH_float; - - for ( int i=1; i<6; i++ ) { - in_re = _mm_set1_ps( phi[2*i] ); - in_im = _mm_set1_ps( phi[2*i+1] ); - clov_re = _mm_load_ps( clover ); - clov_im = _mm_load_ps( clover+SIMD_LENGTH_float ); - cfmadd( clov_re, clov_im, in_re, in_im, &out_re, &out_im ); - clover+=2*SIMD_LENGTH_float; - } - - sse_complex_interleaved_store( out_re, out_im, eta ); - - // lines 5--8; indeces from 48 to 95 - in_re = _mm_setr_ps( phi[0], phi[0], phi[12], phi[12] ); - in_im = _mm_setr_ps( phi[1], phi[1], phi[13], phi[13] ); - clov_re = _mm_load_ps( clover ); - clov_im = _mm_load_ps( clover+SIMD_LENGTH_float ); - cmul( clov_re, clov_im, in_re, in_im, &out_re, &out_im ); - clover+=2*SIMD_LENGTH_float; - - for ( int i=1; i<6; i++ ) { - in_re = _mm_setr_ps( phi[2*i], phi[2*i], phi[2*i+12], phi[2*i+12] ); - in_im = _mm_setr_ps( phi[2*i+1], phi[2*i+1], phi[2*i+13], phi[2*i+13] ); - clov_re = _mm_load_ps( clover ); - clov_im = _mm_load_ps( clover+SIMD_LENGTH_float ); - cfmadd( clov_re, clov_im, in_re, in_im, &out_re, &out_im ); - clover+=2*SIMD_LENGTH_float; - } - - sse_complex_interleaved_store( out_re, out_im, eta+8 ); - - // lines 9--12; indeces from 96 to 143 - in_re = _mm_set1_ps( phi[12] ); - in_im = _mm_set1_ps( phi[13] ); - clov_re = _mm_load_ps( clover ); - clov_im = _mm_load_ps( clover+SIMD_LENGTH_float ); - cmul( clov_re, clov_im, in_re, in_im, &out_re, &out_im ); - clover+=2*SIMD_LENGTH_float; - - for ( int i=1; i<6; i++ ) { - in_re = _mm_set1_ps( phi[2*i+12] ); - in_im = _mm_set1_ps( phi[2*i+13] ); - clov_re = _mm_load_ps( clover ); - clov_im = _mm_load_ps( clover+SIMD_LENGTH_float ); - cfmadd( clov_re, clov_im, in_re, in_im, 
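sse_site_clover_float applies the packed clover term by broadcasting one input component at a time and accumulating whole output column blocks with cmul / cfmadd; in the single-flavour branch the net effect is two independent 6x6 complex blocks acting on the upper and lower halves of the 12-component site spinor. A scalar reference of that action, as a sketch:

#include <complex.h>

/* sketch: eta = C*phi with C block-diagonal, two dense 6x6 complex blocks per site */
static void site_clover_ref( float complex eta[12], const float complex phi[12],
                             const float complex C[2][6][6] ) {
  for ( int b=0; b<2; b++ )
    for ( int i=0; i<6; i++ ) {
      float complex acc = 0.0f;
      for ( int j=0; j<6; j++ )
        acc += C[b][i][j] * phi[6*b+j];
      eta[6*b+i] = acc;
    }
}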
&out_re, &out_im ); - clover+=2*SIMD_LENGTH_float; - } - - sse_complex_interleaved_store( out_re, out_im, eta+16 ); -#ifdef HAVE_TM1p1 - } -#endif - -} - -void sse_site_clover_doublet_double( double *eta, const double *phi, const double *clover ) { - -} - -void sse_site_clover_doublet_float( float *eta, const float *phi, float *clover ) { - - __m128 in_re; - __m128 in_im; - - __m128 clov_re; - __m128 clov_im; - - __m128 out_re; - __m128 out_im; - - // lines 1--4; indeces from 0 to 47 - // lines 5--8; indeces from 48 to 95 - // lines 9--12; indeces from 96 to 143 - for( int n=0; n<3; n++ ) { - in_re = _mm_set1_ps( phi[0] ); - in_im = _mm_set1_ps( phi[1] ); - clov_re = _mm_load_ps( clover ); - clov_im = _mm_load_ps( clover+SIMD_LENGTH_float ); - cmul( clov_re, clov_im, in_re, in_im, &out_re, &out_im ); - clover+=2*SIMD_LENGTH_float; - - for ( int i=1; i<12; i++ ) { - in_re = _mm_set1_ps( phi[2*i] ); - in_im = _mm_set1_ps( phi[2*i+1] ); - clov_re = _mm_load_ps( clover ); - clov_im = _mm_load_ps( clover+SIMD_LENGTH_float ); - cfmadd( clov_re, clov_im, in_re, in_im, &out_re, &out_im ); - clover+=2*SIMD_LENGTH_float; - } - - sse_complex_interleaved_store( out_re, out_im, eta + n*8 ); - } - - - // lines 13--16; indeces from 144 to 191 - // lines 17--20; indeces from 192 to 239 - // lines 21--24; indeces from 240 to 287 - for( int n=3; n<6; n++ ) { - in_re = _mm_set1_ps( phi[24] ); - in_im = _mm_set1_ps( phi[25] ); - clov_re = _mm_load_ps( clover ); - clov_im = _mm_load_ps( clover+SIMD_LENGTH_float ); - cmul( clov_re, clov_im, in_re, in_im, &out_re, &out_im ); - clover+=2*SIMD_LENGTH_float; - - for ( int i=1; i<12; i++ ) { - in_re = _mm_set1_ps( phi[2*i+24] ); - in_im = _mm_set1_ps( phi[2*i+25] ); - clov_re = _mm_load_ps( clover ); - clov_im = _mm_load_ps( clover+SIMD_LENGTH_float ); - cfmadd( clov_re, clov_im, in_re, in_im, &out_re, &out_im ); - clover+=2*SIMD_LENGTH_float; - } - - sse_complex_interleaved_store( out_re, out_im, eta + n*8 ); - } -} - - - -void sse_site_clover_invert_double( double *clover_in, double *clover_out ) { } - -void sse_site_clover_invert_float( float *clover_in, float *clover_out ) { - - float M_tmp1[72], M_tmp2[72]; - - for ( int k=0; k<12; k+=SIMD_LENGTH_float ) { - for ( int j=0; j<6; j++ ) { - for ( int i=k; i -#include - -// res = a*b + c -static inline __m128d sse_fmadd_pd( __m128d a, __m128d b, __m128d c ) { - __m128d res; - res = _mm_mul_pd( a, b ); - res = _mm_add_pd( res, c ); - return res; -} - -// res = -a*b + c -static inline __m128d sse_fnmadd_pd( __m128d a, __m128d b, __m128d c ) { - __m128d res; - res = _mm_mul_pd( a, b ); - res = _mm_sub_pd( c, res ); - return res; -} - -// res = a*b - c -static inline __m128d sse_fmsub_pd( __m128d a, __m128d b, __m128d c ) { - __m128d res; - res = _mm_mul_pd( a, b ); - res = _mm_sub_pd( res, c ); - return res; -} - -static inline double sse_reduce_add_pd( __m128d data ) { - double result; - data = _mm_add_pd( data, _mm_unpackhi_pd( data, data ) ); - _mm_store_sd( &result, data ); - return result; -} - -#endif -#endif \ No newline at end of file diff --git a/src/sse_float_intrinsic.h b/src/sse_float_intrinsic.h deleted file mode 100644 index 33220ba..0000000 --- a/src/sse_float_intrinsic.h +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder. - * - * This file is part of the DDalphaAMG solver library. 
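The sse_fmadd_pd / sse_fnmadd_pd / sse_fmsub_pd helpers above emulate fused multiply-add with separate SSE2 multiplies and adds, and sse_reduce_add_pd folds a register into a scalar with one horizontal add. A short usage sketch (hypothetical function) combining the two patterns:

#include <emmintrin.h>

/* sketch: dot product of two double arrays, n assumed even; the final horizontal
 * add mirrors sse_reduce_add_pd */
static double dot_sse2( const double *a, const double *b, int n ) {
  __m128d acc = _mm_setzero_pd();
  for ( int i=0; i<n; i+=2 )
    acc = _mm_add_pd( _mm_mul_pd( _mm_loadu_pd( a+i ), _mm_loadu_pd( b+i ) ), acc );
  acc = _mm_add_pd( acc, _mm_unpackhi_pd( acc, acc ) );
  double result;
  _mm_store_sd( &result, acc );
  return result;
}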
- * - * The DDalphaAMG solver library is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * The DDalphaAMG solver library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * - * You should have received a copy of the GNU General Public License - * along with the DDalphaAMG solver library. If not, see http://www.gnu.org/licenses/. - * - */ - -#ifndef FLOAT_INTRINSIC_SSE_H -#define FLOAT_INTRINSIC_SSE_H - -#ifdef SSE -#include -#include - -// res = a*b + c -static inline __m128 sse_fmadd( __m128 a, __m128 b, __m128 c ) { - __m128 res; - res = _mm_mul_ps( a, b ); - res = _mm_add_ps( res, c ); - return res; -} - -// res = -a*b + c -static inline __m128 sse_fnmadd( __m128 a, __m128 b, __m128 c ) { - __m128 res; - res = _mm_mul_ps( a, b ); - res = _mm_sub_ps( c, res ); - return res; -} - -// res = a*b - c -static inline __m128 sse_fmsub( __m128 a, __m128 b, __m128 c ) { - __m128 res; - res = _mm_mul_ps( a, b ); - res = _mm_sub_ps( res, c ); - return res; -} - -// res = -a*b - c -static inline __m128 sse_fnmsub( __m128 a, __m128 b, __m128 c ) { - __m128 res; __m128 minus_a; - minus_a = _mm_setzero_ps(); - minus_a = _mm_sub_ps( minus_a, a ); - res = _mm_mul_ps( minus_a, b ); - res = _mm_sub_ps( res, c ); - return res; -} - -static inline void transpose_4_registers( __m128 *data) -{ - __m128 tmp[4]; - - tmp[0] = _mm_unpacklo_ps( data[0], data[1] ); - tmp[1] = _mm_unpacklo_ps( data[2], data[3] ); - tmp[2] = _mm_unpackhi_ps( data[0], data[1] ); - tmp[3] = _mm_unpackhi_ps( data[2], data[3] ); - - data[0] = _mm_movelh_ps( tmp[0], tmp[1] ); - data[1] = _mm_movehl_ps( tmp[1], tmp[0] ); - data[2] = _mm_movelh_ps( tmp[2], tmp[3] ); - data[3] = _mm_movehl_ps( tmp[3], tmp[2] ); -} - - -static inline float sse_reduce_add_ps( __m128 data ) { - float result; - - __m128 tmp; - tmp = _mm_add_ps( data, _mm_movehl_ps( data, data ) ); - data = _mm_add_ss( tmp, _mm_shuffle_ps( tmp, tmp, 1 ) ); - _mm_store_ss( &result, data ); - - return result; -} - -#endif - -#endif // FLOAT_INTRINSIC_SSE_H diff --git a/src/sse_interpolation_generic.c b/src/sse_interpolation_generic.c deleted file mode 100644 index bd5f56a..0000000 --- a/src/sse_interpolation_generic.c +++ /dev/null @@ -1,669 +0,0 @@ -/* - * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder. - * - * This file is part of the DDalphaAMG solver library. - * - * The DDalphaAMG solver library is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * The DDalphaAMG solver library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * - * You should have received a copy of the GNU General Public License - * along with the DDalphaAMG solver library. If not, see http://www.gnu.org/licenses/. 
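transpose_4_registers above performs an in-register 4x4 float transpose via unpack/movelh/movehl; the result is the same as the standard _MM_TRANSPOSE4_PS macro from <xmmintrin.h>. A short usage sketch:

#include <xmmintrin.h>

/* sketch: transpose a row-major 4x4 float tile in place */
static void transpose4x4( float m[16] ) {
  __m128 r0 = _mm_loadu_ps( m+0 ),  r1 = _mm_loadu_ps( m+4 );
  __m128 r2 = _mm_loadu_ps( m+8 ),  r3 = _mm_loadu_ps( m+12 );
  _MM_TRANSPOSE4_PS( r0, r1, r2, r3 );
  _mm_storeu_ps( m+0, r0 );   _mm_storeu_ps( m+4, r1 );
  _mm_storeu_ps( m+8, r2 );   _mm_storeu_ps( m+12, r3 );
}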
- * - */ - -#include "main.h" - -#if defined( SSE ) && defined( INTERPOLATION_OPERATOR_LAYOUT_OPTIMIZED_PRECISION ) - -void interpolation_PRECISION_alloc( level_struct *l ) { - - int k, n = l->num_eig_vect; - - MALLOC( l->is_PRECISION.eigenvalues, complex_PRECISION, n ); - MALLOC( l->is_PRECISION.test_vector, complex_PRECISION*, n ); - -#ifndef INTERPOLATION_SETUP_LAYOUT_OPTIMIZED_PRECISION - MALLOC( l->is_PRECISION.interpolation, complex_PRECISION*, n ); - l->is_PRECISION.interpolation[0] = NULL; - MALLOC_HUGEPAGES( l->is_PRECISION.interpolation[0], complex_PRECISION, n*l->vector_size, 128 ); - for ( k=1; kis_PRECISION.interpolation[k] = l->is_PRECISION.interpolation[0] + k*l->vector_size; -#endif - // ghost shell is communicated in coarse_operator_setup, so we need size=vector_size, not inner_vector_size - MALLOC_HUGEPAGES( l->is_PRECISION.operator, complex_PRECISION, - ((size_t)OPERATOR_COMPONENT_OFFSET_PRECISION)*((size_t)l->vector_size), 128 ); - - l->is_PRECISION.test_vector[0] = NULL; - MALLOC_HUGEPAGES( l->is_PRECISION.test_vector[0], complex_PRECISION, n*l->inner_vector_size, 128 ); - for ( k=1; kis_PRECISION.test_vector[k] = l->is_PRECISION.test_vector[0] + k*l->inner_vector_size; - } -} - - -void interpolation_PRECISION_dummy_alloc( level_struct *l ) { - - MALLOC( l->is_PRECISION.test_vector, complex_PRECISION*, l->num_eig_vect ); - MALLOC( l->is_PRECISION.interpolation, complex_PRECISION*, l->num_eig_vect ); -} - - -void interpolation_PRECISION_dummy_free( level_struct *l ) { - - FREE( l->is_PRECISION.test_vector, complex_PRECISION*, l->num_eig_vect ); - FREE( l->is_PRECISION.interpolation, complex_PRECISION*, l->num_eig_vect ); -} - - -void interpolation_PRECISION_free( level_struct *l ) { - - int n = l->num_eig_vect; - - FREE_HUGEPAGES( l->is_PRECISION.test_vector[0], complex_PRECISION, n*l->inner_vector_size ); - FREE( l->is_PRECISION.eigenvalues, complex_PRECISION, n ); - FREE( l->is_PRECISION.test_vector, complex_PRECISION*, n ); -#ifndef INTERPOLATION_SETUP_LAYOUT_OPTIMIZED_PRECISION - FREE_HUGEPAGES( l->is_PRECISION.interpolation[0], complex_PRECISION, n*l->vector_size ); - FREE( l->is_PRECISION.interpolation, complex_PRECISION*, n ); -#endif - FREE_HUGEPAGES( l->is_PRECISION.operator, complex_PRECISION, OPERATOR_COMPONENT_OFFSET_PRECISION*l->vector_size ); -} - - -void swap8_PRECISION( PRECISION* data ) { - - int i; - PRECISION tmp[8]; - - for ( i=0; i<4; i++ ) { - tmp[i] = data[2*i]; - tmp[i+4] = data[2*i+1]; - } - - for ( i=0; i<8; i++ ) { - data[i] = tmp[i]; - } -} - - -void define_interpolation_PRECISION_operator( complex_PRECISION **interpolation, level_struct *l, struct Thread *threading ) { - - int j, num_eig_vect = l->num_eig_vect; - complex_PRECISION *operator = l->is_PRECISION.operator; - - int start = threading->start_index[l->depth]; - int end = threading->end_index[l->depth]; - - SYNC_CORES(threading) - int offset = SIMD_LENGTH_PRECISION; - for ( j=0; j num_eig_vect) - j_end = num_eig_vect; - - operator = l->is_PRECISION.operator + j*l->vector_size + start*offset; - - for ( int i=start; iis_PRECISION.num_agg, num_eig_vect = l->num_eig_vect, - num_parent_eig_vect = l->num_parent_eig_vect, aggregate_sites = l->num_inner_lattice_sites / num_aggregates; - complex_PRECISION *operator = l->is_PRECISION.operator, *phi_pt = phi, - *phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer; - - START_LOCKED_MASTER(threading) - vector_PRECISION_distribute( phi_c_pt, phi_c, l->next_level ); - END_LOCKED_MASTER(threading) - SYNC_HYPERTHREADS(threading) - -#ifdef 
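interpolation_PRECISION_alloc above requests 128-byte-aligned (and hugepage-backed) storage through the library's MALLOC_HUGEPAGES macro, which is defined elsewhere in the code base. Purely as a generic illustration, comparable alignment can be obtained with posix_memalign:

#include <stdlib.h>
#include <complex.h>

/* sketch: 128-byte-aligned buffer of single-precision complex numbers */
static float complex *alloc_aligned_cfloat( size_t count ) {
  void *p = NULL;
  if ( posix_memalign( &p, 128, count * sizeof(float complex) ) != 0 )
    return NULL;
  return (float complex *)p;
}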
HAVE_TM1p1 - if( g.n_flavours==2 ) - for ( i=threading->n_thread*threading->core + threading->thread; in_core*threading->n_thread ) { - phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer + i*2*2*num_eig_vect; - float tmp_phi1_c_re[2*OPERATOR_COMPONENT_OFFSET_float]; - float tmp_phi1_c_im[2*OPERATOR_COMPONENT_OFFSET_float]; - float tmp_phi2_c_re[2*OPERATOR_COMPONENT_OFFSET_float]; - float tmp_phi2_c_im[2*OPERATOR_COMPONENT_OFFSET_float]; - __m128 zero = _mm_setzero_ps(); - for ( j=0; j<2*OPERATOR_COMPONENT_OFFSET_PRECISION; j+=SIMD_LENGTH_PRECISION ) { - _mm_store_ps(tmp_phi1_c_re+j, zero); - _mm_store_ps(tmp_phi1_c_im+j, zero); - _mm_store_ps(tmp_phi2_c_re+j, zero); - _mm_store_ps(tmp_phi2_c_im+j, zero); - } - // copy phi_c into temporary - for ( j=0; jis_PRECISION.operator + j*l->vector_size/2 + i*2*offset*num_parent_eig_vect*aggregate_sites; - - for ( k=0; kn_thread*threading->core + threading->thread; in_core*threading->n_thread ) { - phi_pt = phi + i*2*num_parent_eig_vect*aggregate_sites; - phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer + i*2*num_eig_vect; - float tmp_phi_c_re[2*OPERATOR_COMPONENT_OFFSET_float]; - float tmp_phi_c_im[2*OPERATOR_COMPONENT_OFFSET_float]; - __m128 zero = _mm_setzero_ps(); - for ( j=0; j<2*OPERATOR_COMPONENT_OFFSET_PRECISION; j+=SIMD_LENGTH_PRECISION ) { - _mm_store_ps(tmp_phi_c_re+j, zero); - _mm_store_ps(tmp_phi_c_im+j, zero); - } - // copy phi_c into temporary - for ( j=0; jis_PRECISION.operator + j*l->vector_size + i*2*offset*num_parent_eig_vect*aggregate_sites; - - for ( k=0; kis_PRECISION.num_agg, num_eig_vect = l->num_eig_vect, - num_parent_eig_vect = l->num_parent_eig_vect, aggregate_sites = l->num_inner_lattice_sites / num_aggregates; - complex_PRECISION *operator = l->is_PRECISION.operator, *phi_pt = phi, - *phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer; - - START_LOCKED_MASTER(threading) - vector_PRECISION_distribute( phi_c_pt, phi_c, l->next_level ); - END_LOCKED_MASTER(threading) - SYNC_HYPERTHREADS(threading) - -#ifdef HAVE_TM1p1 - if( g.n_flavours==2 ) - for ( i=threading->n_thread*threading->core + threading->thread; in_core*threading->n_thread ) { - phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer + i*2*2*num_eig_vect; - - float tmp_phi1_c_re[2*OPERATOR_COMPONENT_OFFSET_float]; - float tmp_phi1_c_im[2*OPERATOR_COMPONENT_OFFSET_float]; - float tmp_phi2_c_re[2*OPERATOR_COMPONENT_OFFSET_float]; - float tmp_phi2_c_im[2*OPERATOR_COMPONENT_OFFSET_float]; - __m128 zero = _mm_setzero_ps(); - for ( j=0; j<2*OPERATOR_COMPONENT_OFFSET_PRECISION; j+=SIMD_LENGTH_PRECISION ) { - _mm_store_ps(tmp_phi1_c_re+j, zero); - _mm_store_ps(tmp_phi1_c_im+j, zero); - _mm_store_ps(tmp_phi2_c_re+j, zero); - _mm_store_ps(tmp_phi2_c_im+j, zero); - } - // copy phi_c into temporary - for ( j=0; jis_PRECISION.operator + j*l->vector_size/2 + i*2*offset*num_parent_eig_vect*aggregate_sites; - - for ( k=0; kn_thread*threading->core + threading->thread; in_core*threading->n_thread ) { - phi_pt = phi + i*2*num_parent_eig_vect*aggregate_sites; - phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer + i*2*num_eig_vect; - float tmp_phi_c_re[2*OPERATOR_COMPONENT_OFFSET_float]; - float tmp_phi_c_im[2*OPERATOR_COMPONENT_OFFSET_float]; - __m128 zero = _mm_setzero_ps(); - for ( j=0; j<2*OPERATOR_COMPONENT_OFFSET_PRECISION; j+=SIMD_LENGTH_PRECISION ) { - _mm_store_ps(tmp_phi_c_re+j, zero); - _mm_store_ps(tmp_phi_c_im+j, zero); - } - // copy phi_c into temporary - for ( j=0; jis_PRECISION.operator + j*l->vector_size + 
i*2*offset*num_parent_eig_vect*aggregate_sites; - - for ( k=0; kis_PRECISION.num_agg, num_eig_vect = l->num_eig_vect, - num_parent_eig_vect = l->num_parent_eig_vect, aggregate_sites = l->num_inner_lattice_sites / num_aggregates; - complex_PRECISION *operator = l->is_PRECISION.operator, *phi_pt = phi, - *phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer; - -#ifdef HAVE_TM1p1 - if( g.n_flavours==2 ) - for ( i=threading->n_thread*threading->core + threading->thread; in_core*threading->n_thread ) { - - int offset = SIMD_LENGTH_PRECISION; - phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer + i*2*2*num_eig_vect; - - // loop over blocks of SIMD_LENGTH_PRECISION vectors - for ( j=0; jis_PRECISION.operator + j*l->vector_size/2 + i*2*offset*num_parent_eig_vect*aggregate_sites; - - // temporary, so we can used aligned load/store, and don't have to mess around with deinterleaving - // complex components and masking - // factor 2 is for low/high (refers to spin components being split up for preserving gamma5-symmetry of coarse operator) - float tmp_phi1_c_re[2*offset]; - float tmp_phi1_c_im[2*offset]; - float tmp_phi2_c_re[2*offset]; - float tmp_phi2_c_im[2*offset]; - __m128 zero = _mm_setzero_ps(); - for ( k1=0; k1<2*offset; k1+=offset ) { - _mm_store_ps(tmp_phi1_c_re+k1, zero); - _mm_store_ps(tmp_phi1_c_im+k1, zero); - _mm_store_ps(tmp_phi2_c_re+k1, zero); - _mm_store_ps(tmp_phi2_c_im+k1, zero); - } - - for ( k=0; k broadcast - __m128 phi1_re = _mm_set1_ps(((float *)phi_pt)[0]); - __m128 phi1_im = _mm_set1_ps(((float *)phi_pt)[1]); - __m128 phi2_re = _mm_set1_ps(((float *)phi_pt)[0+2*num_parent_eig_vect]); - __m128 phi2_im = _mm_set1_ps(((float *)phi_pt)[1+2*num_parent_eig_vect]); - - __m128 operator_re = _mm_load_ps((float *)operator); - __m128 operator_im = _mm_load_ps((float *)operator+offset); - __m128 phi1_c_re = _mm_load_ps(tmp_phi1_c_re+low_high_offset); - __m128 phi1_c_im = _mm_load_ps(tmp_phi1_c_im+low_high_offset); - __m128 phi2_c_re = _mm_load_ps(tmp_phi2_c_re+low_high_offset); - __m128 phi2_c_im = _mm_load_ps(tmp_phi2_c_im+low_high_offset); - - cfmadd_conj(operator_re, operator_im, phi1_re, phi1_im, &phi1_c_re, &phi1_c_im); - cfmadd_conj(operator_re, operator_im, phi2_re, phi2_im, &phi2_c_re, &phi2_c_im); - - _mm_store_ps(tmp_phi1_c_re+low_high_offset, phi1_c_re); - _mm_store_ps(tmp_phi1_c_im+low_high_offset, phi1_c_im); - _mm_store_ps(tmp_phi2_c_re+low_high_offset, phi2_c_re); - _mm_store_ps(tmp_phi2_c_im+low_high_offset, phi2_c_im); - // skip to next real line of matrix - operator += offset; - phi_pt++; - } - phi_pt += num_parent_eig_vect; - low_high_offset = offset; - } - } - - for ( int m=0; m= num_eig_vect ) break; - ((float*)(phi_c_pt+j+m))[0] = tmp_phi1_c_re[m]; - ((float*)(phi_c_pt+j+m))[1] = tmp_phi1_c_im[m]; - } - for ( int m=0; m= num_eig_vect ) break; - ((float*)(phi_c_pt+num_eig_vect+j+m))[0] = tmp_phi2_c_re[m]; - ((float*)(phi_c_pt+num_eig_vect+j+m))[1] = tmp_phi2_c_im[m]; - } - for ( int m=0; m= num_eig_vect ) break; - ((float*)(phi_c_pt+2*num_eig_vect+j+m))[0] = tmp_phi1_c_re[m+offset]; - ((float*)(phi_c_pt+2*num_eig_vect+j+m))[1] = tmp_phi1_c_im[m+offset]; - } - for ( int m=0; m= num_eig_vect ) break; - ((float*)(phi_c_pt+3*num_eig_vect+j+m))[0] = tmp_phi2_c_re[m+offset]; - ((float*)(phi_c_pt+3*num_eig_vect+j+m))[1] = tmp_phi2_c_im[m+offset]; - } - } - } - else -#endif - for ( i=threading->n_thread*threading->core + threading->thread; in_core*threading->n_thread ) { - - int offset = SIMD_LENGTH_PRECISION; - // loop over blocks of SIMD_LENGTH_PRECISION vectors - 
for ( j=0; jnext_level->gs_PRECISION.transfer_buffer + i*2*num_eig_vect; - operator = l->is_PRECISION.operator + j*l->vector_size + i*2*offset*num_parent_eig_vect*aggregate_sites; - - // temporary, so we can used aligned load/store, and don't have to mess around with deinterleaving - // complex components and masking - // factor 2 is for low/high (refers to spin components being split up for preserving gamma5-symmetry of coarse operator) - float tmp_phi_c_re[2*offset]; - float tmp_phi_c_im[2*offset]; - __m128 zero = _mm_setzero_ps(); - for ( k1=0; k1<2*offset; k1+=offset ) { - _mm_store_ps(tmp_phi_c_re+k1, zero); - _mm_store_ps(tmp_phi_c_im+k1, zero); - } - - for ( k=0; k broadcast - __m128 phi_re = _mm_set1_ps(((float *)phi_pt)[0]); - __m128 phi_im = _mm_set1_ps(((float *)phi_pt)[1]); - - __m128 operator_re = _mm_load_ps((float *)operator); - __m128 operator_im = _mm_load_ps((float *)operator+offset); - __m128 phi_c_re = _mm_load_ps(tmp_phi_c_re+low_high_offset); - __m128 phi_c_im = _mm_load_ps(tmp_phi_c_im+low_high_offset); - - cfmadd_conj(operator_re, operator_im, phi_re, phi_im, &phi_c_re, &phi_c_im); - - _mm_store_ps(tmp_phi_c_re+low_high_offset, phi_c_re); - _mm_store_ps(tmp_phi_c_im+low_high_offset, phi_c_im); - // skip to next real line of matrix - operator += offset; - phi_pt++; - } - low_high_offset = offset; - } - } - - for ( int m=0; m= num_eig_vect ) break; - ((float*)(phi_c_pt+j+m))[0] = tmp_phi_c_re[m]; - ((float*)(phi_c_pt+j+m))[1] = tmp_phi_c_im[m]; - } - - for ( int m=0; m= num_eig_vect ) break; - ((float*)(phi_c_pt+num_eig_vect+j+m))[0] = tmp_phi_c_re[m+offset]; - ((float*)(phi_c_pt+num_eig_vect+j+m))[1] = tmp_phi_c_im[m+offset]; - } - } - } - - SYNC_HYPERTHREADS(threading) - START_LOCKED_MASTER(threading) - vector_PRECISION_gather( phi_c, l->next_level->gs_PRECISION.transfer_buffer, l->next_level ); - END_LOCKED_MASTER(threading) - PROF_PRECISION_STOP( _PR, 1, threading ); -} - -#endif // defined( SSE ) && defined( INTERPOLATION_OPERATOR_LAYOUT_OPTIMIZED_PRECISION ) diff --git a/src/sse_interpolation_generic.h b/src/sse_interpolation_generic.h deleted file mode 100644 index 2db7a86..0000000 --- a/src/sse_interpolation_generic.h +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder. - * - * This file is part of the DDalphaAMG solver library. - * - * The DDalphaAMG solver library is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * The DDalphaAMG solver library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * - * You should have received a copy of the GNU General Public License - * along with the DDalphaAMG solver library. If not, see http://www.gnu.org/licenses/. 
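The restriction and interpolation kernels above keep real and imaginary parts in separate SSE registers and accumulate the coarse vector through the cfmadd_conj helper. For reference, a minimal sketch of such a conjugated complex fused multiply-add (hypothetical name sketch_cfmadd_conj; the assumption is that the helper accumulates conj(op)*phi on four deinterleaved floats, matching its use in restrict_PRECISION) could look like this:

#include <xmmintrin.h>

/* acc += conj(op) * phi, with re/im parts held in separate __m128 registers */
static inline void sketch_cfmadd_conj( __m128 op_re, __m128 op_im,
                                        __m128 phi_re, __m128 phi_im,
                                        __m128 *acc_re, __m128 *acc_im ) {
  /* real part: op_re*phi_re + op_im*phi_im */
  *acc_re = _mm_add_ps( *acc_re, _mm_add_ps( _mm_mul_ps( op_re, phi_re ),
                                             _mm_mul_ps( op_im, phi_im ) ) );
  /* imaginary part: op_re*phi_im - op_im*phi_re */
  *acc_im = _mm_add_ps( *acc_im, _mm_sub_ps( _mm_mul_ps( op_re, phi_im ),
                                             _mm_mul_ps( op_im, phi_re ) ) );
}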
- * - */ - -#ifndef SSE_INTERPOLATION_PRECISION_HEADER - #define SSE_INTERPOLATION_PRECISION_HEADER - - #ifdef SSE - void interpolation_PRECISION_alloc( level_struct *l ); - void interpolation_PRECISION_free( level_struct *l ); - void interpolation_PRECISION_dummy_alloc( level_struct *l ); - void interpolation_PRECISION_dummy_free( level_struct *l ); - - void interpolate_PRECISION( vector_PRECISION phi, vector_PRECISION phi_c, level_struct *l, Thread *threading ); - void interpolate3_PRECISION( vector_PRECISION phi, vector_PRECISION phi_c, level_struct *l, Thread *threading ); - void restrict_PRECISION( vector_PRECISION phi_c, vector_PRECISION phi, level_struct *l, Thread *threading ); -#endif - -#endif \ No newline at end of file diff --git a/src/sse_linalg.c b/src/sse_linalg.c deleted file mode 100644 index bf0f9d6..0000000 --- a/src/sse_linalg.c +++ /dev/null @@ -1,795 +0,0 @@ -/* - * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder. - * - * This file is part of the DDalphaAMG solver library. - * - * The DDalphaAMG solver library is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * The DDalphaAMG solver library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * - * You should have received a copy of the GNU General Public License - * along with the DDalphaAMG solver library. If not, see http://www.gnu.org/licenses/. - * - */ - -#include "main.h" - -#ifdef SSE - -#ifdef OPTIMIZED_LINALG_double -void vector_double_scale( vector_double z, vector_double x, complex_double alpha, int start, int end, level_struct *l ) { - - int thread = omp_get_thread_num(); - if(thread == 0 && start != end) - PROF_double_START( _LA6 ); - - __m128d alpha_re = _mm_set1_pd( creal_double(alpha) ); - __m128d alpha_im = _mm_set1_pd( cimag_double(alpha) ); - double *zd = (double*)(z+start); - double *xd = (double*)(x+start); - - for( int i=start; iinner_vector_size ); -} -#endif - -#ifdef OPTIMIZED_LINALG_float -void vector_float_scale( vector_float z, vector_float x, complex_float alpha, int start, int end, level_struct *l ) { - - int thread = omp_get_thread_num(); - if(thread == 0 && start != end) - PROF_float_START( _LA6 ); - - __m128 alpha_re = _mm_set1_ps( creal_float(alpha) ); - __m128 alpha_im = _mm_set1_ps( cimag_float(alpha) ); - float *zf = (float*)(z+start); - float *xf = (float*)(x+start); - - if ( l->depth == 0 ) { - for( int i=start; iinner_vector_size ); -} -#endif - -#ifdef OPTIMIZED_LINALG_float -void vector_float_saxpy( vector_float z, vector_float x, vector_float y, complex_float alpha, int start, int end, level_struct *l ) { - - int thread = omp_get_thread_num(); - if (thread == 0 && start != end ) - PROF_float_START( _LA8 ); - - __m128 alpha_re = _mm_set1_ps( creal_float(alpha) ); - __m128 alpha_im = _mm_set1_ps( cimag_float(alpha) ); - - if ( l->depth == 0 ) { - for ( int i=start; iinner_vector_size ); -} -#endif - -#ifdef OPTIMIZED_LINALG_double -void vector_double_saxpy( vector_double z, vector_double x, vector_double y, complex_double alpha, int start, int end, level_struct *l ) { - - int thread = omp_get_thread_num(); - if (thread == 0 && start != end ) - 
PROF_double_START( _LA8 ); - - __m128d alpha_re = _mm_set1_pd( creal_double(alpha) ); - __m128d alpha_im = _mm_set1_pd( cimag_double(alpha) ); - - for ( int i=start; iinner_vector_size ); -} -#endif - -#ifdef OPTIMIZED_LINALG_double -complex_double global_inner_product_double( vector_double phi, vector_double psi, int start, int end, level_struct *l, struct Thread *threading ) { - - PROF_double_START( _GIP, threading ); - complex_double local_alpha = 0, global_alpha = 0; - - int thread_start; - int thread_end; - compute_core_start_end(start, end, &thread_start, &thread_end, l, threading); - - SYNC_CORES(threading) - - __m128d alpha_re = _mm_setzero_pd(); - __m128d alpha_im = _mm_setzero_pd(); - - if ( l->depth == 0 ) { - for( int i=thread_start; iworkspace)[threading->core] = local_alpha; - END_NO_HYPERTHREADS(threading) - // master sums up all results - SYNC_CORES(threading) - START_MASTER(threading) - for(int i=1; in_core; i++) - ((complex_double *)threading->workspace)[0] += ((complex_double *)threading->workspace)[i]; - local_alpha = ((complex_double *)threading->workspace)[0]; - END_MASTER(threading) - - if ( g.num_processes > 1 ) { - START_MASTER(threading) - PROF_double_START( _ALLR ); - MPI_Allreduce( &local_alpha, &global_alpha, 1, MPI_COMPLEX_double, MPI_SUM, (l->depth==0)?g.comm_cart:l->gs_double.level_comm ); - PROF_double_STOP( _ALLR, 1 ); - ((complex_double *)threading->workspace)[0] = global_alpha; - END_MASTER(threading) - // all threads need the result of the norm - SYNC_MASTER_TO_ALL(threading) - global_alpha = ((complex_double *)threading->workspace)[0]; - PROF_double_STOP( _GIP, (double)(end-start)/(double)l->inner_vector_size, threading ); - return global_alpha; - } else { - // all threads need the result of the norm - SYNC_MASTER_TO_ALL(threading) - local_alpha = ((complex_double *)threading->workspace)[0]; - PROF_double_STOP( _GIP, (double)(end-start)/(double)l->inner_vector_size, threading ); - return local_alpha; - } -} -#endif - -#ifdef OPTIMIZED_LINALG_float -complex_float global_inner_product_float( vector_float phi, vector_float psi, int start, int end, level_struct *l, struct Thread *threading ) { - - PROF_float_START( _GIP, threading ); - complex_float local_alpha = 0, global_alpha = 0; - - int thread_start; - int thread_end; - compute_core_start_end(start, end, &thread_start, &thread_end, l, threading); - - SYNC_CORES(threading) - - __m128 alpha_re = _mm_setzero_ps(); - __m128 alpha_im = _mm_setzero_ps(); - - float *phif = (float*)(phi+thread_start); - float *psif = (float*)(psi+thread_start); - - if ( l->depth == 0 ) { - for( int i=thread_start; iworkspace)[threading->core] = local_alpha; - END_NO_HYPERTHREADS(threading) - // master sums up all results - SYNC_CORES(threading) - START_MASTER(threading) - for(int i=1; in_core; i++) - ((complex_float *)threading->workspace)[0] += ((complex_float *)threading->workspace)[i]; - local_alpha = ((complex_float *)threading->workspace)[0]; - END_MASTER(threading) - - if ( g.num_processes > 1 ) { - START_MASTER(threading) - PROF_float_START( _ALLR ); - MPI_Allreduce( &local_alpha, &global_alpha, 1, MPI_COMPLEX_float, MPI_SUM, (l->depth==0)?g.comm_cart:l->gs_float.level_comm ); - PROF_float_STOP( _ALLR, 1 ); - ((complex_float *)threading->workspace)[0] = global_alpha; - END_MASTER(threading) - // all threads need the result of the norm - SYNC_MASTER_TO_ALL(threading) - global_alpha = ((complex_float *)threading->workspace)[0]; - PROF_float_STOP( _GIP, (double)(end-start)/(double)l->inner_vector_size, threading ); - 
return global_alpha; - } else { - // all threads need the result of the norm - SYNC_MASTER_TO_ALL(threading) - local_alpha = ((complex_float *)threading->workspace)[0]; - PROF_float_STOP( _GIP, (double)(end-start)/(double)l->inner_vector_size, threading ); - return local_alpha; - } -} -#endif - -#ifdef OPTIMIZED_LINALG_double -double global_norm_double( vector_double x, int start, int end, level_struct *l, struct Thread *threading ) { - - PROF_double_START( _GIP, threading ); - - double local_alpha = 0, global_alpha = 0; - - int thread_start; - int thread_end; - compute_core_start_end(start, end, &thread_start, &thread_end, l, threading); - - SYNC_CORES(threading) - - VECTOR_FOR( int i=thread_start, iworkspace)[threading->core] = local_alpha; - END_NO_HYPERTHREADS(threading) - // master sums up all results - SYNC_CORES(threading) - START_MASTER(threading) - for(int i=1; in_core; i++) - ((double *)threading->workspace)[0] += ((double *)threading->workspace)[i]; - local_alpha = ((double *)threading->workspace)[0]; - END_MASTER(threading) - - if ( g.num_processes > 1 ) { - START_MASTER(threading) - PROF_double_START( _ALLR ); - MPI_Allreduce( &local_alpha, &global_alpha, 1, MPI_double, MPI_SUM, (l->depth==0)?g.comm_cart:l->gs_double.level_comm ); - PROF_double_STOP( _ALLR, 1 ); - ((double *)threading->workspace)[0] = global_alpha; - END_MASTER(threading) - // all threads need the result of the norm - SYNC_MASTER_TO_ALL(threading) - global_alpha = ((double *)threading->workspace)[0]; - PROF_double_STOP( _GIP, (double)(end-start)/(double)l->inner_vector_size, threading ); - return (double)sqrt((double)global_alpha); - } else { - // all threads need the result of the norm - SYNC_MASTER_TO_ALL(threading) - local_alpha = ((double *)threading->workspace)[0]; - PROF_double_STOP( _GIP, (double)(end-start)/(double)l->inner_vector_size, threading ); - return (double)sqrt((double)local_alpha); - } -} -#endif - -#ifdef OPTIMIZED_LINALG_float -float global_norm_float( vector_float x, int start, int end, level_struct *l, struct Thread *threading ) { - - PROF_float_START( _GIP, threading ); - - float local_alpha = 0, global_alpha = 0; - - int thread_start; - int thread_end; - compute_core_start_end(start, end, &thread_start, &thread_end, l, threading); - - SYNC_CORES(threading) - - __m128 alpha = _mm_setzero_ps(); - - if ( l->depth == 0 ) { - for( int i=thread_start; iworkspace)[threading->core] = local_alpha; - END_NO_HYPERTHREADS(threading) - // master sums up all results - SYNC_CORES(threading) - START_MASTER(threading) - for(int i=1; in_core; i++) - ((float *)threading->workspace)[0] += ((float *)threading->workspace)[i]; - local_alpha = ((float *)threading->workspace)[0]; - END_MASTER(threading) - - if ( g.num_processes > 1 ) { - START_MASTER(threading) - PROF_float_START( _ALLR ); - MPI_Allreduce( &local_alpha, &global_alpha, 1, MPI_float, MPI_SUM, (l->depth==0)?g.comm_cart:l->gs_float.level_comm ); - PROF_float_STOP( _ALLR, 1 ); - ((float *)threading->workspace)[0] = global_alpha; - END_MASTER(threading) - // all threads need the result of the norm - SYNC_MASTER_TO_ALL(threading) - global_alpha = ((float *)threading->workspace)[0]; - PROF_float_STOP( _GIP, (double)(end-start)/(double)l->inner_vector_size, threading ); - return (float)sqrt((double)global_alpha); - } else { - // all threads need the result of the norm - SYNC_MASTER_TO_ALL(threading) - local_alpha = ((float *)threading->workspace)[0]; - PROF_float_STOP( _GIP, (double)(end-start)/(double)l->inner_vector_size, threading ); - return 
(float)sqrt((double)local_alpha); - } -} -#endif - -#ifdef OPTIMIZED_LINALG_double -void vector_double_multi_saxpy( vector_double z, vector_double *V, complex_double *alpha, - int sign, int count, int start, int end, level_struct *l ) { - - int thread = omp_get_thread_num(); - if (thread == 0 && start != end ) - PROF_double_START( _LA8 ); - - int flag = 0; - __m128d alpha_re[count]; __m128d alpha_im[count]; - for ( int c=0; c EPS_double || -cimag_double(alpha[c]) > EPS_double ) - flag = 1; - } - - if ( flag == 0 ) { - for ( int c=0; c EPS_float || -cimag_float(alpha[c]) > EPS_float ) - flag = 1; - } - - if ( l->depth == 0 ) { - if ( flag == 0 ) { - for ( int c=0; cworkspace)[threading->core] = results; - END_NO_HYPERTHREADS(threading) - // master sums up all results - SYNC_CORES(threading) - START_MASTER(threading) - for(int c=0; cn_core; i++) - ((complex_double **)threading->workspace)[0][c] += ((complex_double **)threading->workspace)[i][c]; - END_MASTER(threading) - // all threads need the result of the norm - SYNC_MASTER_TO_ALL(threading) - for(int c=0; cworkspace)[0][c]; - - PROF_float_STOP( _PIP, (double)(end-start)/(double)l->inner_vector_size, threading ); -} -#endif - -#ifdef OPTIMIZED_LINALG_float -void process_multi_inner_product_float( int count, complex_float *results, vector_float *phi, vector_float psi, - int start, int end, level_struct *l, struct Thread *threading ) { - - PROF_float_START( _PIP, threading ); - int i; - for(int c=0; cdepth == 0 ) { - compute_core_start_end_custom(start, end, &thread_start, &thread_end, l, threading, 12); - for(int c=0; cworkspace)[threading->core] = results; - END_NO_HYPERTHREADS(threading) - // master sums up all results - SYNC_CORES(threading) - START_MASTER(threading) - for(int c=0; cn_core; i++) - ((complex_float **)threading->workspace)[0][c] += ((complex_float **)threading->workspace)[i][c]; - END_MASTER(threading) - // all threads need the result of the norm - SYNC_MASTER_TO_ALL(threading) - for(int c=0; cworkspace)[0][c]; - - PROF_float_STOP( _PIP, (double)(count*(end-start))/(double)l->inner_vector_size, threading ); -} -#endif - -#ifdef OPTIMIZED_LINALG_double -void process_multi_inner_product_double( int count, complex_double *results, vector_double *phi, vector_double psi, - int start, int end, level_struct *l, struct Thread *threading ) { - - PROF_double_START( _PIP, threading ); - int i; - for(int c=0; cdepth == 0 ) { - compute_core_start_end_custom(start, end, &thread_start, &thread_end, l, threading, 12); - for(int c=0; cworkspace)[threading->core] = results; - END_NO_HYPERTHREADS(threading) - // master sums up all results - SYNC_CORES(threading) - START_MASTER(threading) - for(int c=0; cn_core; i++) - ((complex_double **)threading->workspace)[0][c] += ((complex_double **)threading->workspace)[i][c]; - END_MASTER(threading) - // all threads need the result of the norm - SYNC_MASTER_TO_ALL(threading) - for(int c=0; cworkspace)[0][c]; - - PROF_double_STOP( _PIP, (double)(count*(end-start))/(double)l->inner_vector_size, threading ); -} -#endif - -#endif // SSE - diff --git a/src/sse_linalg.h b/src/sse_linalg.h deleted file mode 100644 index cd88fad..0000000 --- a/src/sse_linalg.h +++ /dev/null @@ -1,497 +0,0 @@ -/* - * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder. - * - * This file is part of the DDalphaAMG solver library. 
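The global_norm, global_inner_product and process_multi_inner_product routines above all follow the same reduction pattern: every core writes its partial sum into threading->workspace, the master core adds the per-core results, and a single MPI_Allreduce combines the per-process values. Stripped of threading and profiling, a minimal sketch of that pattern (hypothetical name, plain MPI_DOUBLE instead of the library's MPI_double wrapper) is:

#include <math.h>
#include <mpi.h>

double sketch_global_norm( const double *x, int local_n, MPI_Comm comm ) {
  double local = 0.0, global = 0.0;
  for ( int i = 0; i < local_n; i++ )      /* process-local partial sum */
    local += x[i] * x[i];
  /* sum the partial results over all ranks, then take the square root */
  MPI_Allreduce( &local, &global, 1, MPI_DOUBLE, MPI_SUM, comm );
  return sqrt( global );
}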
- * - * The DDalphaAMG solver library is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * The DDalphaAMG solver library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * - * You should have received a copy of the GNU General Public License - * along with the DDalphaAMG solver library. If not, see http://www.gnu.org/licenses/. - * - */ - -#ifndef LINALG_SSE_H -#define LINALG_SSE_H -#ifdef SSE - - -// Standard Gram-Schmidt on aggregates -static inline void sse_aggregate_gram_schmidt_float( complex_float *V, const int num_vec, - level_struct *l, struct Thread *threading ); -// Gram-Schmidt on a block of vectors, used by Block-Gram-Schmidt -static inline void sse_aggregate_gram_schmidt_block_float( float *V, - int num_vec, int leading_dimension, level_struct *l, struct Thread *threading ); -// used by Block-Gram-Schmidt -static inline void sse_aggregate_block_dot_block_float( float *S, float *U, float *B, - int num_vec, int leading_dimension, level_struct *l, struct Thread *threading ); -// used by Block-Gram-Schmidt -static inline void sse_aggregate_block_minus_block_times_dot_float( float *B, float *U, float *S, - int num_vec, int leading_dimension, level_struct *l, struct Thread *threading ); - -static inline void sse_aggregate_gram_schmidt_double( complex_double *V, const int num_vec, - level_struct *l, struct Thread *threading ) {} -static inline void sse_aggregate_gram_schmidt_block_double( double *V, - int num_vec, int leading_dimension, level_struct *l, struct Thread *threading ) {} -static inline void sse_aggregate_block_dot_block_double( double *S, double *U, double *B, - int num_vec, int leading_dimension, level_struct *l, struct Thread *threading ) {} -static inline void sse_aggregate_block_minus_block_times_dot_double( double *B, double *U, double *S, - int num_vec, int leading_dimension, level_struct *l, struct Thread *threading ) {} - - -static inline void sse_aggregate_gram_schmidt_float( complex_float *V, const int num_vec, level_struct *l, struct Thread *threading ) { - - PROF_float_START( _GRAM_SCHMIDT_ON_AGGREGATES, threading ); - SYNC_CORES(threading) - SYNC_HYPERTHREADS(threading) - long int i, j, k, k1, k2, k3, num_aggregates = l->s_float.num_aggregates, - aggregate_size = l->inner_vector_size / num_aggregates, offset = l->num_lattice_site_var/2; - - float *v_pt1; - float *v_pt2; - float norm1, norm2; - float next_norm1; - float next_norm2; - int ldv = SIMD_LENGTH_float; - int V_block_offset = 2*l->vector_size; - - for ( j=threading->n_thread*threading->core+threading->thread; jn_thread*threading->n_core ) { - - v_pt1 = (float *)V + 0 + j*aggregate_size*2*ldv; - - next_norm1 = 0.0; - next_norm2 = 0.0; - for ( i=0; is_float.num_aggregates, - aggregate_size = l->inner_vector_size / num_aggregates, offset = l->num_lattice_site_var/2; - - float *v_pt1; - float *v_pt2; - float norm; - float next_norm; - int ldv = leading_dimension; - //offset = 6; - - - // current thread chooses an aggregate - for ( int jp=threading->core; jp<2*num_aggregates; jp+=threading->n_core ) { - j = jp/2; - int component = jp%2; - - - v_pt1 = V + 2*component*offset*ldv + j*aggregate_size*2*ldv; - - next_norm = 0.0; - - // 
for the whole aggregate - for ( i=0; is_float.num_aggregates; - int aggregate_size = l->inner_vector_size / num_aggregates; - int offset = l->num_lattice_site_var/2; - - for ( int jp=threading->core; jpn_core ) { - int j = jp/2; - int component = jp%2; - // factors 2 are for complex and spin01/23 aggregates - Up = U + 2*component*offset*leading_dimension + 2*2*j*aggregate_size*leading_dimension; - Bp = B + 2*component*offset*leading_dimension + 2*2*j*aggregate_size*leading_dimension; - __m128 U_re; - __m128 U_im; - __m128 B_re; - __m128 B_im; - __m128 S_re[SIMD_LENGTH_float]; - __m128 S_im[SIMD_LENGTH_float]; - for( int i=0; is_float.num_aggregates; - int aggregate_size = l->inner_vector_size / num_aggregates; - int offset = l->num_lattice_site_var/2; - - for ( int jp=threading->core; jpn_core ) { - int j = jp/2; - int component = jp%2; - // factors 2 are for complex and spin01/23 aggregates - Up = U + 2*component*offset*leading_dimension + 2*2*j*aggregate_size*leading_dimension; - Bp = B + 2*component*offset*leading_dimension + 2*2*j*aggregate_size*leading_dimension; - __m128 U_re; - __m128 U_im; - __m128 B_re; - __m128 B_im; - __m128 S_re[SIMD_LENGTH_float]; - __m128 S_im[SIMD_LENGTH_float]; - for( int i=0; ivector_size), - (PRECISION *)(V + j*l->vector_size), vecs, l, threading ); - aggregate_gram_schmidt_block_PRECISION( (PRECISION *)(V + i*l->vector_size), vecs, SIMD_LENGTH_PRECISION, l, threading ); - } - SYNC_CORES(threading) - PROF_PRECISION_STOP( _GRAM_SCHMIDT_ON_AGGREGATES, 1, threading ); -} - - -void gram_schmidt_on_aggregates_PRECISION_vectorized( complex_PRECISION *V, const int num_vec, level_struct *l, struct Thread *threading ) { - - // the block version has some optimizations which are correct only on the fine grid - if(l->depth == 0) - aggregate_block_gram_schmidt_PRECISION_vectorized(V, num_vec, l, threading); - else - aggregate_gram_schmidt_PRECISION_vectorized(V, num_vec, l, threading); -} - - -void aggregate_orthogonalize_block_wrt_orthonormal_block_PRECISION( PRECISION *B, PRECISION *U, int num_vec, level_struct *l, struct Thread *threading ) { - START_NO_HYPERTHREADS(threading) - - PRECISION *S = NULL; - START_LOCKED_MASTER(threading) - // factors 2 are for complex and spin01/23 aggregates - MALLOC_HUGEPAGES(S, PRECISION, 2*2*l->s_PRECISION.num_aggregates*SIMD_LENGTH_PRECISION*SIMD_LENGTH_PRECISION, 64); - ((PRECISION **)threading->workspace)[0] = S; - END_LOCKED_MASTER(threading) - S = ((PRECISION **)threading->workspace)[0]; - - aggregate_block_dot_block_PRECISION(S, U, B, num_vec, SIMD_LENGTH_PRECISION, l , threading); - aggregate_block_minus_block_times_dot_PRECISION(B, U, S, num_vec, SIMD_LENGTH_PRECISION, l , threading); - - START_LOCKED_MASTER(threading) - FREE_HUGEPAGES(S, PRECISION, 2*2*l->s_PRECISION.num_aggregates*SIMD_LENGTH_PRECISION*SIMD_LENGTH_PRECISION); - END_LOCKED_MASTER(threading) - - END_NO_HYPERTHREADS(threading) -} - - -void aggregate_block_dot_block_PRECISION( PRECISION *S, PRECISION *U, PRECISION *B, - int num_vec, int leading_dimension, level_struct *l, struct Thread *threading ) { - sse_aggregate_block_dot_block_PRECISION( S, U, B, num_vec, leading_dimension, l, threading ); -} - - -void aggregate_block_minus_block_times_dot_PRECISION( PRECISION *B, PRECISION *U, PRECISION *S, - int num_vec, int leading_dimension, level_struct *l, struct Thread *threading ) { - sse_aggregate_block_minus_block_times_dot_PRECISION( B, U, S, num_vec, leading_dimension, l, threading ); -} - -#ifdef GRAM_SCHMIDT_VECTORIZED_PRECISION -void 
setup_gram_schmidt_PRECISION_compute_dots( - complex_PRECISION *thread_buffer, vector_PRECISION *V, int count, int offset, - int start, int end, level_struct *l, struct Thread *threading) { - - int thread_start; - int thread_end; - int cache_block_size = 12*16; - - for(int i=0; i<2*offset; i++) - thread_buffer[i] = 0.0; - - SYNC_CORES(threading) - compute_core_start_end_custom(start, end, &thread_start, &thread_end, l, threading, cache_block_size); - - __m128 dot_re[count]; - __m128 dot_im[count]; - __m128 dot_gamma5_re[count]; - __m128 dot_gamma5_im[count]; - - for ( int j=0; j can use 3 pre-defined +/-1 patterns - __m128 gamma5[3]; - gamma5[0] = _mm_set_ps( -1.0,-1.0,-1.0,-1.0 ); - gamma5[1] = _mm_set_ps( 1.0, 1.0,-1.0,-1.0 ); - gamma5[2] = _mm_set_ps( 1.0, 1.0, 1.0, 1.0 ); - - for(int m=0; m<3; m++) { - - sse_complex_deinterleaved_load( (float*)(V[j]+i+k+4*m), &vj_re, &vj_im ); - sse_complex_deinterleaved_load( (float*)(V[count]+i+k+4*m), &v_re, &v_im ); - - gamma5_v_re = _mm_mul_ps(gamma5[m], v_re); - gamma5_v_im = _mm_mul_ps(gamma5[m], v_im); - - cfmadd_conj(vj_re, vj_im, v_re, v_im, dot_re+j, dot_im+j); - cfmadd_conj(vj_re, vj_im, gamma5_v_re, gamma5_v_im, dot_gamma5_re+j, dot_gamma5_im+j); - } - } - } - } - for ( int j=0; jworkspace)[threading->core] = thread_buffer; - END_NO_HYPERTHREADS(threading) - // master sums up all results - SYNC_CORES(threading) - START_MASTER(threading) - for(int i=1; in_core; i++) { - for(int j=0; jworkspace)[0][j] += ((complex_PRECISION **)threading->workspace)[i][j]; - ((complex_PRECISION **)threading->workspace)[0][j+offset] += ((complex_PRECISION **)threading->workspace)[i][j+offset]; - } - } - END_MASTER(threading) - // only master needs the result in this case (it will be distributed later) -} -#endif - -#ifdef GRAM_SCHMIDT_VECTORIZED_PRECISION -void setup_gram_schmidt_PRECISION_axpys( - complex_PRECISION *thread_buffer, vector_PRECISION *V, int count, int offset, - int start, int end, level_struct *l, struct Thread *threading) { - - int thread_start; - int thread_end; - int cache_block_size = 12*16; - - compute_core_start_end_custom(start, end, &thread_start, &thread_end, l, threading, cache_block_size); - - __m128 dot_re[count]; - __m128 dot_im[count]; - __m128 dot_gamma5_re[count]; - __m128 dot_gamma5_im[count]; - - for ( int j=0; j can use 3 pre-defined +/-1 patterns - __m128 gamma5[3]; - gamma5[0] = _mm_set_ps( -1.0,-1.0,-1.0,-1.0 ); - gamma5[1] = _mm_set_ps( 1.0, 1.0,-1.0,-1.0 ); - gamma5[2] = _mm_set_ps( 1.0, 1.0, 1.0, 1.0 ); - - for(int m=0; m<3; m++) { - - sse_complex_deinterleaved_load( (float*)(V[j]+i+k+4*m), &vj_re, &vj_im ); - sse_complex_deinterleaved_load( (float*)(V[count]+i+k+4*m), &v_re, &v_im ); - - gamma5_vj_re = _mm_mul_ps(gamma5[m], vj_re); - gamma5_vj_im = _mm_mul_ps(gamma5[m], vj_im); - - cfnmadd(vj_re, vj_im, dot_re[j], dot_im[j], &v_re, &v_im); - cfnmadd(gamma5_vj_re, gamma5_vj_im, dot_gamma5_re[j], dot_gamma5_im[j], &v_re, &v_im); - - sse_complex_interleaved_store(v_re, v_im, (float*)(V[count]+i+k+4*m) ); - } - } - } - } -} -#endif - -#endif diff --git a/src/sse_linalg_generic.h b/src/sse_linalg_generic.h deleted file mode 100644 index 00390d5..0000000 --- a/src/sse_linalg_generic.h +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder. - * - * This file is part of the DDalphaAMG solver library. 
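The aggregate Gram-Schmidt kernels deleted above are SSE-blocked versions of classical Gram-Schmidt applied separately on each aggregate, and separately on the two spin halves so that the gamma5 structure of the coarse operator is preserved. A scalar reference of the underlying orthonormalization, assuming num_vec test vectors of aggregate length n stored contiguously as V[k*n + i] (a layout assumption made only for this sketch), would be:

#include <complex.h>
#include <math.h>

void sketch_aggregate_gram_schmidt( double complex *V, int num_vec, int n ) {
  for ( int k = 0; k < num_vec; k++ ) {
    for ( int j = 0; j < k; j++ ) {                 /* project out earlier vectors */
      double complex dot = 0.0;
      for ( int i = 0; i < n; i++ ) dot += conj( V[j*n + i] ) * V[k*n + i];
      for ( int i = 0; i < n; i++ ) V[k*n + i] -= dot * V[j*n + i];
    }
    double norm = 0.0;                              /* normalize the k-th vector */
    for ( int i = 0; i < n; i++ ) norm += creal( conj( V[k*n + i] ) * V[k*n + i] );
    norm = sqrt( norm );
    for ( int i = 0; i < n; i++ ) V[k*n + i] /= norm;
  }
}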
- * - * The DDalphaAMG solver library is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * The DDalphaAMG solver library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * - * You should have received a copy of the GNU General Public License - * along with the DDalphaAMG solver library. If not, see http://www.gnu.org/licenses/. - * - */ - -#ifndef SSE_LINALG_PRECISION_HEADER - #define SSE_LINALG_PRECISION_HEADER - #ifdef SSE - - void gram_schmidt_on_aggregates_PRECISION_vectorized( complex_PRECISION *V, const int num_vec, level_struct *l, struct Thread *threading ); - // Block-Gram-Schmidt on aggregates - void aggregate_block_gram_schmidt_PRECISION_vectorized( complex_PRECISION *V, const int num_vec, level_struct *l, struct Thread *threading ); - // Standard Gram-Schmidt on aggregates - void aggregate_gram_schmidt_PRECISION_vectorized( complex_PRECISION *V, const int num_vec, level_struct *l, struct Thread *threading ); - - // Gram-Schmidt on a block of vectors, used by Block-Gram-Schmidt - void aggregate_gram_schmidt_block_PRECISION( PRECISION *V, - int num_vec, int leading_dimension, level_struct *l, struct Thread *threading ); - // used by Block-Gram-Schmidt - void aggregate_orthogonalize_block_wrt_orthonormal_block_PRECISION( PRECISION *B, PRECISION *U, - int num_vec, level_struct *l, struct Thread *threading ); - // used by Block-Gram-Schmidt - void aggregate_block_dot_block_PRECISION( PRECISION *S, PRECISION *U, PRECISION *B, - int num_vec, int leading_dimension, level_struct *l, struct Thread *threading ); - // used by Block-Gram-Schmidt - void aggregate_block_minus_block_times_dot_PRECISION( PRECISION *B, PRECISION *U, PRECISION *S, - int num_vec, int leading_dimension, level_struct *l, struct Thread *threading ); - - void setup_gram_schmidt_PRECISION_compute_dots( - complex_PRECISION *thread_buffer, vector_PRECISION *V, int count, int offset, - int start, int end, level_struct *l, struct Thread *threading); - - void setup_gram_schmidt_PRECISION_axpys( - complex_PRECISION *thread_buffer, vector_PRECISION *V, int count, int offset, - int start, int end, level_struct *l, struct Thread *threading); - -#endif -#endif \ No newline at end of file diff --git a/src/threading.c b/src/threading.c index d793c3a..aa731f9 100644 --- a/src/threading.c +++ b/src/threading.c @@ -116,12 +116,8 @@ void setup_no_threading(struct Thread *no_threading, struct level_struct *l) void compute_core_start_end(int start, int end, int *core_start, int *core_end, struct level_struct *l, struct Thread *threading) { -#ifdef SSE - int min_per_core = 2*l->num_lattice_site_var; -#else // due to loop unrolling in low level functions int min_per_core = 3*40; -#endif // printf0("min_per_core = %d\n", min_per_core ); compute_core_start_end_custom(start, end, core_start, core_end, l, threading, min_per_core); } diff --git a/src/top_level.c b/src/top_level.c index 68fa204..5e04827 100644 --- a/src/top_level.c +++ b/src/top_level.c @@ -21,25 +21,29 @@ #include "main.h" -void rhs_define( vector_double rhs, level_struct *l, struct Thread *threading ) { +void rhs_define( vector_double *rhs, level_struct *l, struct Thread *threading ) { // no 
hyperthreading here if(threading->thread != 0) return; - int start = threading->start_index[l->depth]; - int end = threading->end_index[l->depth]; + //int start = threading->start_index[l->depth]; + //int end = threading->end_index[l->depth]; if ( g.rhs == 0 ) { - vector_double_define( rhs, 1, start, end, l ); + //vector_double_define( rhs, 1, start, end, l ); + vector_double_define_new( rhs, 1, l, threading ); START_MASTER(threading) if ( g.print > 0 ) printf0("rhs = ones\n"); END_MASTER(threading) } else if ( g.rhs == 1 ) { - vector_double_define( rhs, 0, start, end, l ); + //vector_double_define( rhs, 0, start, end, l ); + vector_double_define_new( rhs, 0, l, threading ); if ( g.my_rank == 0 ) { START_LOCKED_MASTER(threading) - rhs[0] = 1.0; + //rhs->vector_buffer[0] = 1.0; + for ( int i=0; inum_vect; i++ ) + rhs->vector_buffer[i*(rhs->size)] = 1.0; END_LOCKED_MASTER(threading) } START_MASTER(threading) @@ -48,13 +52,15 @@ void rhs_define( vector_double rhs, level_struct *l, struct Thread *threading ) } else if ( g.rhs == 2 ) { // this would yield different results if we threaded it, so we don't START_LOCKED_MASTER(threading) - vector_double_define_random( rhs, 0, l->inner_vector_size, l ); + //vector_double_define_random( rhs, 0, l->inner_vector_size, l ); + vector_double_define_random_new( rhs, l, threading ); END_LOCKED_MASTER(threading) START_MASTER(threading) if ( g.print > 0 ) printf0("rhs = random\n"); END_MASTER(threading) } else if ( g.rhs == 3 ) { - vector_double_define( rhs, 0, start, end, l ); + //vector_double_define( rhs, 0, start, end, l ); + vector_double_define_new( rhs, 0, l, threading ); } else { ASSERT( g.rhs >= 0 && g.rhs <= 4 ); } @@ -62,10 +68,9 @@ void rhs_define( vector_double rhs, level_struct *l, struct Thread *threading ) } -int wilson_driver( vector_double solution, vector_double source, level_struct *l, struct Thread *threading ) { - - int iter = 0, start = threading->start_index[l->depth], end = threading->end_index[l->depth]; +int wilson_driver( vector_double *solution, vector_double *source, level_struct *l, struct Thread *threading ) { + int iter = 0; //, start = threading->start_index[l->depth], end = threading->end_index[l->depth]; vector_double rhs = (g.mixed_precision==2 && g.method >= 0)?g.p_MP.dp.b:g.p.b; vector_double sol = (g.mixed_precision==2 && g.method >= 0)?g.p_MP.dp.x:g.p.x; @@ -78,8 +83,11 @@ int wilson_driver( vector_double solution, vector_double source, level_struct *l for ( int i=0; i<100; i++ ) { double tmp_t = -MPI_Wtime(); #endif - - vector_double_copy( rhs, source, start, end, l ); + vector_double_change_layout( &sol, &sol, _LV_SV_NV, no_threading ); + vector_double_change_layout( &rhs, &rhs, _LV_SV_NV, no_threading ); + + //vector_double_copy( &rhs, source, start, end, l ); + vector_double_copy_new( &rhs, source, l, threading ); if ( g.method == -1 ) { cgn_double( &(g.p), l, threading ); } else if ( g.mixed_precision == 2 ) { @@ -87,7 +95,9 @@ int wilson_driver( vector_double solution, vector_double source, level_struct *l } else { iter = fgmres_double( &(g.p), l, threading ); } - vector_double_copy( solution, sol, start, end, l ); + //vector_double_copy( solution, &sol, start, end, l ); + vector_double_copy_new( solution, &sol, l, threading ); + #ifdef WILSON_BENCHMARK tmp_t += MPI_Wtime(); if ( tmp_t < t_min ) @@ -101,17 +111,21 @@ int wilson_driver( vector_double solution, vector_double source, level_struct *l END_MASTER(threading) #endif + vector_double_change_layout( &sol, &sol, _NV_LV_SV, no_threading ); + 
vector_double_change_layout( &rhs, &rhs, _NV_LV_SV, no_threading ); + return iter; } -void solve( vector_double solution, vector_double source, level_struct *l, struct Thread *threading ) { +void solve( vector_double *solution, vector_double *source, level_struct *l, struct Thread *threading ) { if ( g.vt.evaluation ) { vector_double rhs = g.mixed_precision==2?g.p_MP.dp.b:g.p.b; // this would yield different results if we threaded it, so we don't START_LOCKED_MASTER(threading) - vector_double_define_random( rhs, 0, l->inner_vector_size, l ); + //vector_double_define_random( &rhs, 0, l->inner_vector_size, l ); + vector_double_define_random_new( &rhs, l, threading ); scan_var( &(g.vt), l ); END_LOCKED_MASTER(threading) } else { @@ -122,8 +136,11 @@ void solve( vector_double solution, vector_double source, level_struct *l, struc void solve_driver( level_struct *l, struct Thread *threading ) { - vector_double solution = NULL, source = NULL; - double minus_twisted_bc[4], norm; + vector_double solution, source; + double minus_twisted_bc[4], norm[g.num_rhs_vect]; + + vector_double_init( &solution ); + vector_double_init( &source ); if(g.bc==2) for ( int i=0; i<4; i++ ) @@ -135,55 +152,66 @@ void solve_driver( level_struct *l, struct Thread *threading ) { printf0("inverting doublet operator\n"); } #endif - PUBLIC_MALLOC( solution, complex_double, l->inner_vector_size ); - PUBLIC_MALLOC( source, complex_double, l->inner_vector_size ); + vector_double_alloc( &solution, _INNER, g.num_rhs_vect, l, threading ); + vector_double_alloc( &source, _INNER, g.num_rhs_vect, l, threading ); - rhs_define( source, l, threading ); + rhs_define( &source, l, threading ); + + vector_double_change_layout( &solution, &solution, _LV_SV_NV, no_threading ); + vector_double_change_layout( &source, &source, _LV_SV_NV, no_threading ); if(g.bc==2) - apply_twisted_bc_to_vector_double( source, source, g.twisted_bc, l); - - norm = global_norm_double( source, 0, l->inner_vector_size, l, threading ); - printf0("source vector norm: %le\n",norm); + apply_twisted_bc_to_vector_double_new( &source, &source, g.twisted_bc, l); + global_norm_double_new( norm, &source, l, threading ); + for( int i=0; iinner_vector_size, l, threading ); + printf0("source vector %d norm: %le\n",i,norm[i]); + } #ifdef HAVE_TM1p1 if( g.n_flavours == 1 ) #endif #ifdef HAVE_TM - if ( g.mu + g.mu_odd_shift != 0.0 || g.mu + g.mu_even_shift != 0.0 ) - if(g.downprop) { + if ( g.mu + g.mu_odd_shift != 0.0 || g.mu + g.mu_even_shift != 0.0 ) + if(g.downprop) { - START_MASTER(threading) - printf0("\n\n+--------------------------- up ---------------------------+\n\n"); - END_MASTER(threading) + START_MASTER(threading) + printf0("\n\n+--------------------------- up ---------------------------+\n\n"); + END_MASTER(threading) - solve( solution, source, l, threading ); + solve( &solution, &source, l, threading ); - if(g.bc==2) - apply_twisted_bc_to_vector_double( solution, solution, minus_twisted_bc, l); + if(g.bc==2) + apply_twisted_bc_to_vector_double_new( &solution, &solution, minus_twisted_bc, l); - START_LOCKED_MASTER(threading) - printf0("\n\n+-------------------------- down --------------------------+\n\n"); - g.mu*=-1; - g.mu_odd_shift*=-1; - g.mu_even_shift*=-1; - END_LOCKED_MASTER(threading) + START_LOCKED_MASTER(threading) + printf0("\n\n+-------------------------- down --------------------------+\n\n"); + g.mu*=-1; + g.mu_odd_shift*=-1; + g.mu_even_shift*=-1; + END_LOCKED_MASTER(threading) - tm_term_update( g.mu, l, threading ); - finalize_operator_update( l, 
threading ); - } + tm_term_update( g.mu, l, threading ); + finalize_operator_update( l, threading ); + } #endif - solve( solution, source, l, threading ); + solve( &solution, &source, l, threading ); if(g.bc==2) - apply_twisted_bc_to_vector_double( solution, solution, minus_twisted_bc, l); + apply_twisted_bc_to_vector_double_new( &solution, &solution, minus_twisted_bc, l); + + global_norm_double_new( norm, &solution, l, threading ); + for( int i=0; iinner_vector_size, l, threading ); + printf0("solution vector %d norm: %le\n",i,norm[i]); + } - norm = global_norm_double( solution, 0, l->inner_vector_size, l, threading ); - printf0("solution vector norm: %le\n",norm); + vector_double_change_layout( &solution, &solution, _NV_LV_SV, no_threading ); + vector_double_change_layout( &source, &source, _NV_LV_SV, no_threading ); - PUBLIC_FREE( solution, complex_double, l->inner_vector_size ); - PUBLIC_FREE( source, complex_double, l->inner_vector_size ); + vector_double_free( &solution, l, threading ); + vector_double_free( &source, l, threading ); #ifdef HAVE_TM1p1 if( g.n_flavours == 2 ) diff --git a/src/top_level.h b/src/top_level.h index cc4b029..a281daa 100644 --- a/src/top_level.h +++ b/src/top_level.h @@ -24,9 +24,9 @@ struct Thread; - void rhs_define( vector_double rhs, level_struct *l, struct Thread *threading ); - int wilson_driver( vector_double solution, vector_double source, level_struct *l, struct Thread *threading ); - void solve( vector_double solution, vector_double source, level_struct *l, struct Thread *threading ); + void rhs_define( vector_double *rhs, level_struct *l, struct Thread *threading ); + int wilson_driver( vector_double *solution, vector_double *source, level_struct *l, struct Thread *threading ); + void solve( vector_double *solution, vector_double *source, level_struct *l, struct Thread *threading ); void solve_driver( level_struct *l, struct Thread *threading ); #endif diff --git a/src/var_table.h b/src/var_table.h index abb321c..dbc6b2a 100644 --- a/src/var_table.h +++ b/src/var_table.h @@ -33,18 +33,19 @@ warning0("SCAN_VAR does not support threading, yet.\n"); \ kind *tmp_var = (kind*)(var_pt); \ kind signum = (start_valinner_vector_size ); \ + vector_double_alloc( &v, _INNER, 1, l, no_threading ); \ if (g.mixed_precision==2) fgmres_MP( &(g.p_MP), l, no_threading ); \ else fgmres_double( &(g.p), l, no_threading ); \ - vector_double_copy( v, x, 0, l->inner_vector_size, l ); \ - norm_v = global_norm_double( v, 0, l->inner_vector_size, l, no_threading ); \ + vector_double_copy( &v, &x, 0, l->inner_vector_size, l ); \ + norm_v = global_norm_double( &v, 0, l->inner_vector_size, l, no_threading ); \ } \ \ for ( *tmp_var = (kind)start_val; signum*(*tmp_var) <= signum*((kind)end_val) + EPS_double; \ @@ -68,32 +69,32 @@ } \ printf0("scanning variable \"%s\", value: %lf, run %d of %d\n", name, (double)(*tmp_var), i+1, g.vt.average_over ); \ if ( g.vt.track_error ) { \ - apply_operator_double( b, v, &(g.p), l, no_threading ); \ - vector_double_define( x, 0, 0, l->inner_vector_size, l ); \ + apply_operator_double( &b, &v, &(g.p), l, no_threading ); \ + vector_double_define( &x, 0, 0, l->inner_vector_size, l ); \ if ( g.vt.track_cgn_error ) { \ ASSERT( g.method >=0 && g.p.restart_length >= 4 ); \ - vector_double_define( x, 0, 0, l->inner_vector_size, l ); \ + vector_double_define( &x, 0, 0, l->inner_vector_size, l ); \ cgn_double( &(g.p), l, no_threading ); \ - vector_double_minus( x, x, v, 0, l->inner_vector_size, l ); \ - g.vt.p_end->values[_CGNR_ERR] += ( 
global_norm_double( x, 0, l->inner_vector_size, l, no_threading ) / norm_v ) / ((double)g.vt.average_over); \ + vector_double_minus( &x, &x, &v, 0, l->inner_vector_size, l ); \ + g.vt.p_end->values[_CGNR_ERR] += ( global_norm_double( &x, 0, l->inner_vector_size, l, no_threading ) / norm_v ) / ((double)g.vt.average_over); \ printf0("CGN: error norm: %le\n", g.vt.p_end->values[_CGNR_ERR] ); \ - vector_double_define( x, 0, 0, l->inner_vector_size, l ); \ + vector_double_define( &x, 0, 0, l->inner_vector_size, l ); \ } \ } else {\ - rhs_define( b, l, no_threading );\ + rhs_define( &b, l, no_threading );\ } \ - vector_double_define( x, 0, 0, l->inner_vector_size, l ); \ + vector_double_define( &x, 0, 0, l->inner_vector_size, l ); \ if (g.mixed_precision==2) fgmres_MP( &(g.p_MP), l, no_threading ); \ else fgmres_double( &(g.p), l, no_threading ); \ if ( i == g.vt.average_over-1 ) prof_print( l ); \ if ( g.vt.track_error ) { \ - vector_double_minus( x, x, v, 0, l->inner_vector_size, l ); \ - g.vt.p_end->values[_SLV_ERR] += ( global_norm_double( x, 0, l->inner_vector_size, l, no_threading ) / norm_v ) / ((double)g.vt.average_over); \ + vector_double_minus( &x, &x, &v, 0, l->inner_vector_size, l ); \ + g.vt.p_end->values[_SLV_ERR] += ( global_norm_double( &x, 0, l->inner_vector_size, l, no_threading ) / norm_v ) / ((double)g.vt.average_over); \ } \ } \ } \ if ( g.vt.track_error ) { \ - FREE( v, complex_double, l->inner_vector_size ); \ + vector_double_free( &v, l, no_threading ); \ } \ tt1 = MPI_Wtime(); \ printf0("\n\ntotal time for parameter scan: %d minutes and %d seconds\n", \ diff --git a/src/vcycle_generic.c b/src/vcycle_generic.c index 038a8fa..cfabc5b 100644 --- a/src/vcycle_generic.c +++ b/src/vcycle_generic.c @@ -22,10 +22,10 @@ #include "main.h" #include "vcycle_PRECISION.h" -void smoother_PRECISION( vector_PRECISION phi, vector_PRECISION Dphi, vector_PRECISION eta, +void smoother_PRECISION( vector_PRECISION *phi, vector_PRECISION *Dphi, vector_PRECISION *eta, int n, const int res, level_struct *l, struct Thread *threading ) { - ASSERT( phi != eta ); + ASSERT( phi->vector_buffer != eta->vector_buffer ); START_MASTER(threading); PROF_PRECISION_START( _SM ); @@ -47,10 +47,10 @@ void smoother_PRECISION( vector_PRECISION phi, vector_PRECISION Dphi, vector_PRE if ( g.method == 4 || g.method == 6 ) { if ( g.odd_even ) { if ( res == _RES ) { - apply_operator_PRECISION( l->sp_PRECISION.x, phi, &(l->p_PRECISION), l, threading ); - vector_PRECISION_minus( l->sp_PRECISION.x, eta, l->sp_PRECISION.x, start, end, l ); + apply_operator_PRECISION( &(l->sp_PRECISION.x), phi, &(l->p_PRECISION), l, threading ); + vector_PRECISION_minus( &(l->sp_PRECISION.x), eta, &(l->sp_PRECISION.x), start, end, l ); } - block_to_oddeven_PRECISION( l->sp_PRECISION.b, res==_RES?l->sp_PRECISION.x:eta, l, threading ); + block_to_oddeven_PRECISION( &(l->sp_PRECISION.b), res==_RES?&(l->sp_PRECISION.x):eta, l, threading ); START_LOCKED_MASTER(threading) l->sp_PRECISION.initial_guess_zero = _NO_RES; END_LOCKED_MASTER(threading) @@ -62,21 +62,21 @@ void smoother_PRECISION( vector_PRECISION phi, vector_PRECISION Dphi, vector_PRE else coarse_solve_odd_even_PRECISION( &(l->sp_PRECISION), &(l->oe_op_PRECISION), l, threading ); } if ( res == _NO_RES ) { - oddeven_to_block_PRECISION( phi, l->sp_PRECISION.x, l, threading ); + oddeven_to_block_PRECISION( phi, &(l->sp_PRECISION.x), l, threading ); } else { - oddeven_to_block_PRECISION( l->sp_PRECISION.b, l->sp_PRECISION.x, l, threading ); - vector_PRECISION_plus( phi, phi, 
l->sp_PRECISION.b, start, end, l ); + oddeven_to_block_PRECISION( &(l->sp_PRECISION.b), &(l->sp_PRECISION.x), l, threading ); + vector_PRECISION_plus( phi, phi, &(l->sp_PRECISION.b), start, end, l ); } } else { START_LOCKED_MASTER(threading) - l->sp_PRECISION.x = phi; l->sp_PRECISION.b = eta; + l->sp_PRECISION.x = *phi; l->sp_PRECISION.b = *eta; END_LOCKED_MASTER(threading) fgmres_PRECISION( &(l->sp_PRECISION), l, threading ); } } else if ( g.method == 5 ) { - vector_PRECISION_copy( l->sp_PRECISION.b, eta, start, end, l ); + vector_PRECISION_copy( &(l->sp_PRECISION.b), eta, start, end, l ); bicgstab_PRECISION( &(l->sp_PRECISION), l, threading ); - vector_PRECISION_copy( phi, l->sp_PRECISION.x, start, end, l ); + vector_PRECISION_copy( phi, &(l->sp_PRECISION.x), start, end, l ); } ASSERT( Dphi == NULL ); } @@ -87,19 +87,19 @@ void smoother_PRECISION( vector_PRECISION phi, vector_PRECISION Dphi, vector_PRE } -void vcycle_PRECISION( vector_PRECISION phi, vector_PRECISION Dphi, vector_PRECISION eta, +void vcycle_PRECISION( vector_PRECISION *phi, vector_PRECISION *Dphi, vector_PRECISION *eta, int res, level_struct *l, struct Thread *threading ) { if ( g.interpolation && l->level>0 ) { for ( int i=0; in_cy; i++ ) { if ( i==0 && res == _NO_RES ) { - restrict_PRECISION( l->next_level->p_PRECISION.b, eta, l, threading ); + restrict_PRECISION( &(l->next_level->p_PRECISION.b), eta, l, threading ); } else { int start = threading->start_index[l->depth]; int end = threading->end_index[l->depth]; - apply_operator_PRECISION( l->vbuf_PRECISION[2], phi, &(l->p_PRECISION), l, threading ); - vector_PRECISION_minus( l->vbuf_PRECISION[3], eta, l->vbuf_PRECISION[2], start, end, l ); - restrict_PRECISION( l->next_level->p_PRECISION.b, l->vbuf_PRECISION[3], l, threading ); + apply_operator_PRECISION( &(l->vbuf_PRECISION[2]), phi, &(l->p_PRECISION), l, threading ); + vector_PRECISION_minus( &(l->vbuf_PRECISION[3]), eta, &(l->vbuf_PRECISION[2]), start, end, l ); + restrict_PRECISION( &(l->next_level->p_PRECISION.b), &(l->vbuf_PRECISION[3]), l, threading ); } if ( !l->next_level->idle ) { START_MASTER(threading) @@ -110,7 +110,7 @@ void vcycle_PRECISION( vector_PRECISION phi, vector_PRECISION Dphi, vector_PRECI if ( g.kcycle ) fgmres_PRECISION( &(l->next_level->p_PRECISION), l->next_level, threading ); else - vcycle_PRECISION( l->next_level->p_PRECISION.x, NULL, l->next_level->p_PRECISION.b, _NO_RES, l->next_level, threading ); + vcycle_PRECISION( &(l->next_level->p_PRECISION.x), NULL, &(l->next_level->p_PRECISION.b), _NO_RES, l->next_level, threading ); } else { if ( g.odd_even ) { if ( g.method == 6 ) { @@ -128,9 +128,9 @@ void vcycle_PRECISION( vector_PRECISION phi, vector_PRECISION Dphi, vector_PRECI END_MASTER(threading) } if( i == 0 && res == _NO_RES ) - interpolate3_PRECISION( phi, l->next_level->p_PRECISION.x, l, threading ); + interpolate3_PRECISION( phi, &(l->next_level->p_PRECISION.x), l, threading ); else - interpolate_PRECISION( phi, l->next_level->p_PRECISION.x, l, threading ); + interpolate_PRECISION( phi, &(l->next_level->p_PRECISION.x), l, threading ); smoother_PRECISION( phi, Dphi, eta, l->post_smooth_iter, _RES, l, threading ); res = _RES; } diff --git a/src/vcycle_generic.h b/src/vcycle_generic.h index 5e54a74..8c251f6 100644 --- a/src/vcycle_generic.h +++ b/src/vcycle_generic.h @@ -32,10 +32,10 @@ #include "threading.h" #include "solver_analysis.h" - void smoother_PRECISION( vector_PRECISION phi, vector_PRECISION Dphi, vector_PRECISION eta, + void smoother_PRECISION( vector_PRECISION *phi, 
                               vector_PRECISION *Dphi, vector_PRECISION *eta, int n, const int res, level_struct *l, struct Thread *threading );
-  void vcycle_PRECISION( vector_PRECISION phi, vector_PRECISION Dphi, vector_PRECISION eta,
+  void vcycle_PRECISION( vector_PRECISION *phi, vector_PRECISION *Dphi, vector_PRECISION *eta,
                          int res, level_struct *l, struct Thread *threading );
 #endif
diff --git a/src/vector_generic.c b/src/vector_generic.c
new file mode 100644
index 0000000..85276c1
--- /dev/null
+++ b/src/vector_generic.c
@@ -0,0 +1,301 @@
+/*
+ * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder.
+ *
+ * This file is part of the DDalphaAMG solver library.
+ *
+ * The DDalphaAMG solver library is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * The DDalphaAMG solver library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with the DDalphaAMG solver library. If not, see http://www.gnu.org/licenses/.
+ *
+ */
+
+#include "main.h"
+
+void vector_PRECISION_init( vector_PRECISION *vec ) {
+
+  vec->vector_buffer = NULL;
+}
+
+
+void vector_PRECISION_alloc( vector_PRECISION *vec, const int type, int num_vect, level_struct *l, Thread *threading ) {
+
+  switch (type){
+    case _ORDINARY :
+      PUBLIC_MALLOC( vec->vector_buffer, complex_PRECISION, l->vector_size*num_vect );
+      vec->size = l->vector_size;
+      break;
+    case _SCHWARZ :
+      PUBLIC_MALLOC( vec->vector_buffer, complex_PRECISION, l->schwarz_vector_size*num_vect );
+      vec->size = l->schwarz_vector_size;
+      break;
+    case _INNER:
+      PUBLIC_MALLOC( vec->vector_buffer, complex_PRECISION, l->inner_vector_size*num_vect );
+      vec->size = l->inner_vector_size;
+      break;
+  }
+
+  vec->type = type;
+  vec->num_vect = num_vect;
+  vec->layout = _NV_LV_SV;
+  vec->l = l;
+}
+
+
+void vector_PRECISION_free( vector_PRECISION *vec, level_struct *l, struct Thread *threading ) {
+
+  switch (vec->type){
+    case _ORDINARY : PUBLIC_FREE( vec->vector_buffer, complex_PRECISION, l->vector_size*vec->num_vect );
+      break;
+    case _SCHWARZ : PUBLIC_FREE( vec->vector_buffer, complex_PRECISION, l->schwarz_vector_size*vec->num_vect );
+      break;
+    case _INNER : PUBLIC_FREE( vec->vector_buffer, complex_PRECISION, l->inner_vector_size*vec->num_vect );
+      break;
+  }
+}
+
+
+// vector storage for PRECISION precision
+void vector_PRECISION_define( vector_PRECISION *phi, complex_PRECISION value, int start, int end, level_struct *l ) {
+
+  int thread = omp_get_thread_num();
+  if(thread == 0 && start != end)
+    PROF_PRECISION_START( _SET );
+  if ( phi->vector_buffer != NULL ) {
+    int i;
+    for ( i=start; i<end; i++ )
+      phi->vector_buffer[i] = value;
+  } else {
+    error0("Error in \"vector_PRECISION_define\": pointer is null\n");
+  }
+  if(thread == 0 && start != end)
+    PROF_PRECISION_STOP( _SET, 1 );
+}
+
+
+void vector_PRECISION_define_new( vector_PRECISION *phi, complex_PRECISION value, level_struct *l, struct Thread *threading ) {
+
+  int start, end;
+  compute_core_start_end(0, (phi->size)*(phi->num_vect), &start, &end, l, threading);
+  int thread = omp_get_thread_num();
+  if(thread == 0)
+    PROF_PRECISION_START( _SET );
+
+  if ( phi->vector_buffer != NULL ) {
+    int i;
+    for ( i=start; i<end; i++ )
+      phi->vector_buffer[i] = value;
+  } else {
+    error0("Error in \"vector_PRECISION_define_new\": pointer is null\n");
+  }
+  if(thread == 0)
+    PROF_PRECISION_STOP( _SET, 1 );
+}
+
+
+void vector_PRECISION_real_scale( vector_PRECISION *z, vector_PRECISION *x, complex_PRECISION alpha,
+                                  int start, int end, level_struct *l ) {
+
+  vector_PRECISION_check_comp( z, x );
+  //z->layout = x->layout;
+
+  int thread = omp_get_thread_num();
+  if(thread == 0 && start != end)
+    PROF_PRECISION_START( _RS );
+
+  PRECISION *r_z = (PRECISION*)z->vector_buffer, *r_x = (PRECISION*)x->vector_buffer, r_alpha = creal_PRECISION(alpha);
+  int r_start = 2*start, r_end = 2*end;
+
+  REAL_VECTOR_FOR( int i=r_start, i<r_end, r_z[i] = r_alpha*r_x[i], i++, l );
+
+  if(thread == 0 && start != end)
+    PROF_PRECISION_STOP( _RS, (double)(end-start)/(double)l->inner_vector_size );
+}
+
+
+/*
+ * opt = 0 : z = alpha*x
+ * opt = 1 : z = (1/alpha)*x
+ */
+void vector_PRECISION_real_scale_new( vector_PRECISION *z, vector_PRECISION *x, complex_PRECISION *alpha,
+                                      int n, int opt, level_struct *l, struct Thread *threading ) {
+
+  //vector_PRECISION_check_comp( z, x );
+
+  int i, j, jj, start, end;
+  PRECISION r_alpha[x->num_vect];
+
+  if(opt){
+    VECTOR_LOOP(j, x->num_vect, jj, r_alpha[j+jj]=1.0/creal_PRECISION(alpha[n*x->num_vect+j+jj]);)
+  }else{
+    VECTOR_LOOP(j, x->num_vect, jj, r_alpha[j+jj]=creal_PRECISION(alpha[n*x->num_vect+j+jj]);)
+  }
+
+  compute_core_start_end(0, x->size, &start, &end, l, threading);
+  int thread = omp_get_thread_num();
+  if(thread == 0 && start != end)
+    PROF_PRECISION_START( _RS );
+
+  //vector_PRECISION_change_layout( x, x, _LV_SV_NV, no_threading );
+  //vector_PRECISION_change_layout( z, z, _LV_SV_NV, no_threading );
+  if(z == x){
+    for( i=start; i<end; i++)
+      VECTOR_LOOP(j, x->num_vect, jj, z->vector_buffer[i*x->num_vect+j+jj] *= r_alpha[j+jj];)
+  } else {
+    for( i=start; i<end; i++)
+      VECTOR_LOOP(j, x->num_vect, jj, z->vector_buffer[i*x->num_vect+j+jj] = r_alpha[j+jj]*x->vector_buffer[i*x->num_vect+j+jj];)
+  }
+  //vector_PRECISION_change_layout( x, x, _NV_LV_SV, no_threading );
+  //vector_PRECISION_change_layout( z, z, _NV_LV_SV, no_threading );
+
+  if(thread == 0 && start != end)
+    PROF_PRECISION_STOP( _RS, (double)(end-start)/(double)l->inner_vector_size );
+}
+
+
+
+void vector_PRECISION_copy( vector_PRECISION *z, vector_PRECISION *x, int start, int end, level_struct *l ) {
+
+  if(z == x) return;
+
+  buffer_PRECISION z_pt=z->vector_buffer, x_pt=x->vector_buffer;
+  int thread = omp_get_thread_num();
+  if(thread == 0 && start != end)
+    PROF_PRECISION_START( _CPY );
+  VECTOR_FOR( int i=start, i<end, z_pt[i] = x_pt[i], i++, l );
+  if(thread == 0 && start != end)
+    PROF_PRECISION_STOP( _CPY, (double)(end-start)/(double)l->inner_vector_size );
+}
+
+
+void vector_PRECISION_copy_new( vector_PRECISION *z, vector_PRECISION *x, level_struct *l, struct Thread *threading ) {
+
+  if(z == x) return;
+
+  int i, j, jj, start, end;
+  compute_core_start_end(0, x->size, &start, &end, l, threading);
+  int thread = omp_get_thread_num();
+  if(thread == 0)
+    PROF_PRECISION_START( _CPY );
+
+  for( i=start; i<end; i++)
+    VECTOR_LOOP(j, x->num_vect, jj, z->vector_buffer[i*x->num_vect+j+jj] = x->vector_buffer[i*x->num_vect+j+jj];)
+
+  //vector_PRECISION_change_layout( x, x, _NV_LV_SV, no_threading );
+  //vector_PRECISION_change_layout( z, z, _NV_LV_SV, no_threading );
+
+  if(thread == 0)
+    PROF_PRECISION_STOP( _CPY, (double)(end-start)/(double)l->inner_vector_size );
+}
+
+
+void vector_PRECISION_check_comp( vector_PRECISION *vec1, vector_PRECISION *vec2) {
+
+  if(vec1->num_vect != vec2->num_vect)
+    error0("Error: The number of vectors has to be the same in both vectors\n");
+
+  if(vec1->l->level != vec2->l->level)
+    error0("Error: The level of multigrid must be the same in both vectors\n");
+
+  if(vec1->type != vec2->type)
+    error0("Error: The type must be the same in both vectors\n");
+
+}
+
+
+void vector_PRECISION_change_layout( vector_PRECISION *vec_out, vector_PRECISION *vec_in, const int layout, struct Thread *threading ) {
+
+  if(vec_in->layout==layout) return;
+
+  vector_PRECISION_check_comp( vec_out, vec_in );
+
+  int n, i, sv, lv = 0, num_sv = vec_in->l->num_lattice_site_var;
+  vector_PRECISION vec_tmp;
+
+  if( vec_in->vector_buffer == vec_out->vector_buffer ){
+    vector_PRECISION_init( &vec_tmp );
+    vector_PRECISION_alloc( &vec_tmp, vec_in->type, vec_in->num_vect, vec_in->l, no_threading );
+  } else {
+    vec_tmp = *vec_out;
+  }
+
+  switch (vec_in->type){
+    case _ORDINARY :
+      lv = vec_in->l->num_lattice_sites;
+      break;
+    case _SCHWARZ :
+      lv = 2*vec_in->l->num_lattice_sites - vec_in->l->num_inner_lattice_sites;
+      break;
+    case _INNER:
+      lv = vec_in->l->num_inner_lattice_sites;
+      break;
+  }
+
+  switch (layout){
+    case _NV_LV_SV :
+      for( n=0; n<vec_in->num_vect; n++ )
+        for( i=0; i<lv; i++ )
+          for( sv=0; sv<num_sv; sv++ )
+            vec_tmp.vector_buffer[INDEX_NV_LV_SV( n, vec_in->num_vect, i, lv, sv, num_sv )] = vec_in->vector_buffer[INDEX_LV_SV_NV( n, vec_in->num_vect, i, lv, sv, num_sv )];
+
+      vec_out->layout = _NV_LV_SV;
+      break;
+    case _LV_SV_NV :
+      for( i=0; i<lv; i++ )
+        for( sv=0; sv<num_sv; sv++ )
+          for( n=0; n<vec_in->num_vect; n++ )
+            vec_tmp.vector_buffer[INDEX_LV_SV_NV( n, vec_in->num_vect, i, lv, sv, num_sv )] = vec_in->vector_buffer[INDEX_NV_LV_SV( n, vec_in->num_vect, i, lv, sv, num_sv )];
+
+      vec_out->layout = _LV_SV_NV;
+      break;
+  }
+
+  if( vec_in->vector_buffer == vec_out->vector_buffer ){
+    vector_PRECISION_copy( vec_out, &vec_tmp, 0, lv*num_sv*vec_out->num_vect, vec_out->l );
+    vector_PRECISION_free( &vec_tmp, vec_in->l, no_threading );
+  }
+
+}
+
+void vector_PRECISION_test_routine( level_struct *l, struct Thread *threading ) {
+
+  PRECISION diff = 0;
+
+  vector_PRECISION vp[3];
+
+  for(int i=0; i<3; i++){
+    vector_PRECISION_init( &vp[i] );
+    vector_PRECISION_alloc( &vp[i], _ORDINARY, 4, l, threading );
+  }
+
+  START_LOCKED_MASTER(threading)
+
+  vector_PRECISION_define_random( &vp[0], 0, 4*l->vector_size, l );
+  vector_PRECISION_copy( &vp[1], &vp[0], 0, 4*l->vector_size, l );
+  vector_PRECISION_change_layout( &vp[1], &vp[1], _LV_SV_NV, no_threading );
+  vector_PRECISION_change_layout( &vp[1], &vp[1], _NV_LV_SV, no_threading );
+  vector_PRECISION_minus( &vp[2], &vp[1], &vp[0], 0, 4*l->vector_size, l );
+  diff = global_norm_PRECISION( &vp[2], 0, 4*l->vector_size, l, no_threading )/
+         global_norm_PRECISION( &vp[0], 0, 4*l->vector_size, l, no_threading );
+
+  test0_PRECISION("depth: %d, correctness of vector PRECISION change layout: %le\n", l->depth, diff );
+
+  END_LOCKED_MASTER(threading)
+  for(int i=0; i<3; i++){
+    vector_PRECISION_free( &vp[i], l, threading );
+  }
+  if ( l->level == 0 && g.method == 0)
+    return;
+  else
+    vector_PRECISION_test_routine(l->next_level, threading);
+}
diff --git a/src/vector_generic.h b/src/vector_generic.h
new file mode 100644
index 0000000..901e4a2
--- /dev/null
+++ b/src/vector_generic.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder.
+ *
+ * This file is part of the DDalphaAMG solver library.
+ *
+ * The DDalphaAMG solver library is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * + * The DDalphaAMG solver library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * + * You should have received a copy of the GNU General Public License + * along with the DDalphaAMG solver library. If not, see http://www.gnu.org/licenses/. + * + */ + +#ifndef VECTOR_PRECISION_HEADER + #define VECTOR_PRECISION_HEADER + + struct Thread; + + void vector_PRECISION_init( vector_PRECISION *vec ); + void vector_PRECISION_alloc( vector_PRECISION *vec, const int type, int num_vect, level_struct *l, struct Thread *threading ); + void vector_PRECISION_free( vector_PRECISION *vec, level_struct *l, Thread *threading); + void vector_PRECISION_define( vector_PRECISION *phi, complex_PRECISION value, int start, int end, level_struct *l ); + void vector_PRECISION_define_new( vector_PRECISION *phi, complex_PRECISION value, level_struct *l, struct Thread *threading ); + void vector_PRECISION_real_scale( vector_PRECISION *z, vector_PRECISION *x, complex_PRECISION alpha, + int start, int end, level_struct *l ); + void vector_PRECISION_real_scale_new( vector_PRECISION *z, vector_PRECISION *x, complex_PRECISION *alpha, + int n, int opt, level_struct *l, struct Thread *threading ); + void vector_PRECISION_copy( vector_PRECISION *z, vector_PRECISION *x, int start, int end, level_struct *l ); // z := x + void vector_PRECISION_copy_new( vector_PRECISION *z, vector_PRECISION *x, level_struct *l, struct Thread *threading ); + void vector_PRECISION_check_comp( vector_PRECISION *vec1, vector_PRECISION *vec2 ); + void vector_PRECISION_change_layout( vector_PRECISION *vec_out, vector_PRECISION *vec_in, const int layout, struct Thread *threading ); + void vector_PRECISION_test_routine( level_struct *l, struct Thread *threading ); + +#endif diff --git a/src/vectorization_control.h b/src/vectorization_control.h deleted file mode 100644 index f05a701..0000000 --- a/src/vectorization_control.h +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder. - * - * This file is part of the DDalphaAMG solver library. - * - * The DDalphaAMG solver library is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * The DDalphaAMG solver library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * - * You should have received a copy of the GNU General Public License - * along with the DDalphaAMG solver library. If not, see http://www.gnu.org/licenses/. 
- * - */ - -#ifndef VECTORIZATION_CONTROL_H -#define VECTORIZATION_CONTROL_H - -#ifdef SSE - -#define SIMD_LENGTH_float 4 -#define SIMD_LENGTH_double 2 - -#define OPTIMIZED_COARSE_NEIGHBOR_COUPLING_float -#define OPTIMIZED_COARSE_SELF_COUPLING_float -#define INTERPOLATION_OPERATOR_LAYOUT_OPTIMIZED_float -#define INTERPOLATION_SETUP_LAYOUT_OPTIMIZED_float -#define OPTIMIZED_NEIGHBOR_COUPLING_double -#define OPTIMIZED_NEIGHBOR_COUPLING_float -#define OPTIMIZED_SELF_COUPLING_float -#define GRAM_SCHMIDT_VECTORIZED_float -#define OPTIMIZED_LINALG_float -#define OPTIMIZED_LINALG_double - -#include "sse_complex_float_intrinsic.h" -#include "sse_complex_double_intrinsic.h" - -#endif - -#define OPERATOR_COMPONENT_OFFSET_float (SIMD_LENGTH_float *((l->num_eig_vect+SIMD_LENGTH_float -1)/SIMD_LENGTH_float )) -#define OPERATOR_COMPONENT_OFFSET_double (SIMD_LENGTH_double*((l->num_eig_vect+SIMD_LENGTH_double-1)/SIMD_LENGTH_double)) - -#define OPERATOR_TYPE_float float -#define OPERATOR_TYPE_double double - -#endif // VECTORIZATION_CONTROL_H diff --git a/src/vectorization_dirac_generic.c b/src/vectorization_dirac_generic.c deleted file mode 100644 index 9ea2b3e..0000000 --- a/src/vectorization_dirac_generic.c +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder. - * - * This file is part of the DDalphaAMG solver library. - * - * The DDalphaAMG solver library is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * The DDalphaAMG solver library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * - * You should have received a copy of the GNU General Public License - * along with the DDalphaAMG solver library. If not, see http://www.gnu.org/licenses/. 
- * - */ - -#include "main.h" - -#ifdef SSE -void d_plus_clover_aggregate_PRECISION_vectorized( complex_PRECISION *eta1, complex_PRECISION *eta2, - complex_PRECISION *phi, schwarz_PRECISION_struct *s, level_struct *l, - int site, int *direction_flags ) { - - int offset = SIMD_LENGTH_PRECISION; - int site_offset = 12*offset; - int index_out; - int index_bw; - int index_fw; - int *neighbor = s->op.neighbor_table; - int *backward_neighbor = s->op.backward_neighbor_table; - complex_PRECISION *phi_pt; - complex_PRECISION buffer1[site_offset] __attribute__((aligned(64))); - complex_PRECISION buffer2[site_offset] __attribute__((aligned(64))); - config_PRECISION D_pt; - config_PRECISION D = s->op.D; - - // add clover term/shift - spin0and1_site_clover_PRECISION_vectorized( eta1, phi+site_offset*site, s->op.clover+42*site, 4+s->op.m0, offset ); - spin2and3_site_clover_PRECISION_vectorized( eta2, phi+site_offset*site, s->op.clover+42*site, 4+s->op.m0, offset ); - - index_out = site; - - for(int mu=0; mu<4; mu++) { - index_fw = neighbor[4*index_out + mu]; - index_bw = backward_neighbor[4*index_out + mu]; - - // from backward - if ( direction_flags[2*mu+0] == 1 ) { - D_pt = D + 36*index_bw+9*mu; - phi_pt = phi + site_offset*index_bw; - mvmh_PRECISION_vectorized( buffer2+0*offset, D_pt, phi_pt+0*offset, offset ); - mvmh_PRECISION_vectorized( buffer2+3*offset, D_pt, phi_pt+3*offset, offset ); - mvmh_PRECISION_vectorized( buffer2+6*offset, D_pt, phi_pt+6*offset, offset ); - mvmh_PRECISION_vectorized( buffer2+9*offset, D_pt, phi_pt+9*offset, offset ); - twospin_PRECISION_vectorized( eta1, eta2, buffer2, offset, mu, -1.0 ); - } - - // from forward - if ( direction_flags[2*mu+1] == 1 ) { - D_pt = D + 36*index_out+9*mu; - phi_pt = phi + site_offset*index_fw; - mvm_PRECISION_vectorized( buffer1+0*offset, D_pt, phi_pt+0*offset, offset ); - mvm_PRECISION_vectorized( buffer1+3*offset, D_pt, phi_pt+3*offset, offset ); - mvm_PRECISION_vectorized( buffer1+6*offset, D_pt, phi_pt+6*offset, offset ); - mvm_PRECISION_vectorized( buffer1+9*offset, D_pt, phi_pt+9*offset, offset ); - twospin_PRECISION_vectorized( eta1, eta2, buffer1, offset, mu, 1.0 ); - } - } -} -#endif - -#ifdef SSE -void d_neighbor_aggregate_PRECISION_vectorized( complex_PRECISION *eta1, complex_PRECISION *eta2, - complex_PRECISION *phi, const int mu, schwarz_PRECISION_struct *s, level_struct *l, - int site ) { - - int offset = SIMD_LENGTH_PRECISION; - int site_offset = 12*offset; - int index_out; - int index_fw; - int *neighbor = s->op.neighbor_table; - complex_PRECISION *phi_pt; - complex_PRECISION buffer[site_offset] __attribute__((aligned(64))); - config_PRECISION D_pt; - config_PRECISION D = s->op.D; - - index_out = site; - - // requires the positive boundaries of phi to be communicated befor - index_fw = neighbor[4*index_out + mu]; - D_pt = D + 36*index_out+9*mu; - phi_pt = phi + site_offset*index_fw; - mvm_PRECISION_vectorized_simd_length( buffer+0*offset, D_pt, phi_pt+0*offset ); - mvm_PRECISION_vectorized_simd_length( buffer+3*offset, D_pt, phi_pt+3*offset ); - mvm_PRECISION_vectorized_simd_length( buffer+6*offset, D_pt, phi_pt+6*offset ); - mvm_PRECISION_vectorized_simd_length( buffer+9*offset, D_pt, phi_pt+9*offset ); - twospin2_p_PRECISION_vectorized_simd_length( eta1, eta2, buffer, mu ); -} -#endif - -#ifdef SSE -void diagonal_aggregate_PRECISION_vectorized( complex_PRECISION *eta1, complex_PRECISION *eta2, - complex_PRECISION *phi, schwarz_PRECISION_struct *s, level_struct *l, - int site ) { - - int offset = SIMD_LENGTH_PRECISION; - int 
site_offset = 12*offset; - - sse_diagonal_aggregate_PRECISION( eta1, eta2, phi+site_offset*site, s->op.odd_proj+12*site, offset ); -} -#endif diff --git a/src/vectorization_dirac_generic.h b/src/vectorization_dirac_generic.h deleted file mode 100644 index 5b8f02c..0000000 --- a/src/vectorization_dirac_generic.h +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder. - * - * This file is part of the DDalphaAMG solver library. - * - * The DDalphaAMG solver library is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * The DDalphaAMG solver library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * - * You should have received a copy of the GNU General Public License - * along with the DDalphaAMG solver library. If not, see http://www.gnu.org/licenses/. - * - */ - -#ifndef VECTORIZATION_DIRAC_PRECISION_HEADER - #define VECTORIZATION_DIRAC_PRECISION_HEADER - -#ifdef SSE - #include "sse_dirac.h" -#endif - - // caller is responsibel for checking that he needs coupling in this direction for this site - void d_neighbor_aggregate_PRECISION_vectorized( complex_PRECISION *eta1, complex_PRECISION *eta2, - complex_PRECISION *phi, const int mu, schwarz_PRECISION_struct *s, level_struct *l, - int site ); - - void d_plus_clover_aggregate_PRECISION_vectorized( complex_PRECISION *eta1, complex_PRECISION *eta2, - complex_PRECISION *phi, schwarz_PRECISION_struct *s, level_struct *l, - int site, int *direction_flags ); - - void diagonal_aggregate_PRECISION_vectorized( complex_PRECISION *eta1, complex_PRECISION *eta2, - complex_PRECISION *phi, schwarz_PRECISION_struct *s, - level_struct *l, int site ); - - // spinors are vectorized, gauge is same for all (use for multiple rhs) - static inline void mvm_PRECISION_vectorized_simd_length( - const complex_PRECISION *eta, const complex_PRECISION *D, const complex_PRECISION *phi ) { -#ifdef SSE - sse_mvm_PRECISION_simd_length( eta, D, phi ); -#endif - - } - // spinors are vectorized, gauge is same for all (use for multiple rhs) - static inline void mvm_PRECISION_vectorized( - const complex_PRECISION *eta, const complex_PRECISION *D, const complex_PRECISION *phi, int elements ) { -#ifdef SSE - sse_mvm_PRECISION( eta, D, phi, elements ); -#endif - } - - // spinors are vectorized, gauge is same for all (use for multiple rhs) - static inline void mvmh_PRECISION_vectorized( - const complex_PRECISION *eta, const complex_PRECISION *D, const complex_PRECISION *phi, int elements ) { -#ifdef SSE - sse_mvmh_PRECISION( eta, D, phi, elements ); -#endif - } - - // mu is according to the enum for T,Z,Y,X defined in clifford.h - static inline void twospin_PRECISION_vectorized( complex_PRECISION *out_spin0and1, complex_PRECISION *out_spin2and3, const complex_PRECISION *in, int elements, int mu, double sign ) { -#ifdef SSE - sse_twospin_PRECISION( out_spin0and1, out_spin2and3, in, elements, mu, sign ); -#endif - } - static inline void twospin_p_T_PRECISION_vectorized( complex_PRECISION *out_spin0and1, complex_PRECISION *out_spin2and3, const complex_PRECISION *in, int elements ) { - twospin_PRECISION_vectorized( out_spin0and1, 
out_spin2and3, in, elements, T, 1.0); - } - static inline void twospin_n_T_PRECISION_vectorized( complex_PRECISION *out_spin0and1, complex_PRECISION *out_spin2and3, const complex_PRECISION *in, int elements ) { - twospin_PRECISION_vectorized( out_spin0and1, out_spin2and3, in, elements, T, -1.0); - } - static inline void twospin_p_Z_PRECISION_vectorized( complex_PRECISION *out_spin0and1, complex_PRECISION *out_spin2and3, const complex_PRECISION *in, int elements ) { - twospin_PRECISION_vectorized( out_spin0and1, out_spin2and3, in, elements, Z, 1.0); - } - static inline void twospin_n_Z_PRECISION_vectorized( complex_PRECISION *out_spin0and1, complex_PRECISION *out_spin2and3, const complex_PRECISION *in, int elements ) { - twospin_PRECISION_vectorized( out_spin0and1, out_spin2and3, in, elements, Z, -1.0); - } - static inline void twospin_p_Y_PRECISION_vectorized( complex_PRECISION *out_spin0and1, complex_PRECISION *out_spin2and3, const complex_PRECISION *in, int elements ) { - twospin_PRECISION_vectorized( out_spin0and1, out_spin2and3, in, elements, Y, 1.0); - } - static inline void twospin_n_Y_PRECISION_vectorized( complex_PRECISION *out_spin0and1, complex_PRECISION *out_spin2and3, const complex_PRECISION *in, int elements ) { - twospin_PRECISION_vectorized( out_spin0and1, out_spin2and3, in, elements, Y, -1.0); - } - static inline void twospin_p_X_PRECISION_vectorized( complex_PRECISION *out_spin0and1, complex_PRECISION *out_spin2and3, const complex_PRECISION *in, int elements ) { - twospin_PRECISION_vectorized( out_spin0and1, out_spin2and3, in, elements, X, 1.0); - } - static inline void twospin_n_X_PRECISION_vectorized( complex_PRECISION *out_spin0and1, complex_PRECISION *out_spin2and3, const complex_PRECISION *in, int elements ) { - twospin_PRECISION_vectorized( out_spin0and1, out_spin2and3, in, elements, X, -1.0); - } - - // mu is according to the enum for T,Z,Y,X defined in clifford.h - static inline void twospin2_p_PRECISION_vectorized_simd_length( complex_PRECISION *out_spin0and1, complex_PRECISION *out_spin2and3, const complex_PRECISION *in, int mu ) { -#ifdef SSE - sse_twospin2_p_PRECISION_simd_length( out_spin0and1, out_spin2and3, in, mu ); -#endif - } - // mu is according to the enum for T,Z,Y,X defined in clifford.h - static inline void twospin2_p_PRECISION_vectorized( complex_PRECISION *out_spin0and1, complex_PRECISION *out_spin2and3, const complex_PRECISION *in, int elements, int mu ) { -#ifdef SSE - sse_twospin2_p_PRECISION( out_spin0and1, out_spin2and3, in, elements, mu ); -#endif - } - static inline void twospin2_p_T_PRECISION_vectorized( complex_PRECISION *out_spin0and1, complex_PRECISION *out_spin2and3, const complex_PRECISION *in, int elements ) { - twospin2_p_PRECISION_vectorized( out_spin0and1, out_spin2and3, in, elements, T); - } - static inline void twospin2_p_Z_PRECISION_vectorized( complex_PRECISION *out_spin0and1, complex_PRECISION *out_spin2and3, const complex_PRECISION *in, int elements ) { - twospin2_p_PRECISION_vectorized( out_spin0and1, out_spin2and3, in, elements, Z); - } - static inline void twospin2_p_Y_PRECISION_vectorized( complex_PRECISION *out_spin0and1, complex_PRECISION *out_spin2and3, const complex_PRECISION *in, int elements ) { - twospin2_p_PRECISION_vectorized( out_spin0and1, out_spin2and3, in, elements, Y); - } - static inline void twospin2_p_X_PRECISION_vectorized( complex_PRECISION *out_spin0and1, complex_PRECISION *out_spin2and3, const complex_PRECISION *in, int elements ) { - twospin2_p_PRECISION_vectorized( out_spin0and1, out_spin2and3, in, 
elements, X); - } - - static inline void spin0and1_site_clover_PRECISION_vectorized( const complex_PRECISION *eta, const complex_PRECISION *phi, - const config_PRECISION clover, double shift, int elements ) { -#ifdef SSE - sse_spin0and1_site_clover_PRECISION( eta, phi, clover, shift, elements ); -#endif - } - - static inline void spin2and3_site_clover_PRECISION_vectorized( const complex_PRECISION *eta, const complex_PRECISION *phi, - const config_PRECISION clover, double shift, int elements ) { -#ifdef SSE - sse_spin2and3_site_clover_PRECISION( eta, phi, clover, shift, elements ); -#endif - } - -#endif
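
For orientation, the following is a minimal usage sketch of the vector container introduced above in vector_generic.c / vector_generic.h, written against the double instantiation of the PRECISION generics. It only strings together calls whose signatures appear in the new header; the level_struct and Thread objects, the sketch function name, and the choice of two stacked vectors are illustrative assumptions, not part of the patch.

// Sketch only: assumes the double instantiation of the PRECISION generics and
// that l and threading come from the usual DDalphaAMG setup.
void vector_usage_sketch( level_struct *l, struct Thread *threading ) {

  vector_double v, w;

  vector_double_init( &v );                              // sets vector_buffer = NULL
  vector_double_init( &w );
  vector_double_alloc( &v, _INNER, 2, l, threading );    // inner-sized buffer holding 2 stacked vectors
  vector_double_alloc( &w, _INNER, 2, l, threading );    // allocation also sets the default layout _NV_LV_SV

  vector_double_define_new( &v, 1.0, l, threading );     // every entry of both stacked vectors := 1
  vector_double_copy_new( &w, &v, l, threading );        // w := v

  // reorder w in place from the allocation default _NV_LV_SV to _LV_SV_NV;
  // vector_double_change_layout allocates a scratch copy when input and output share a buffer
  vector_double_change_layout( &w, &w, _LV_SV_NV, no_threading );

  vector_double_free( &v, l, threading );
  vector_double_free( &w, l, threading );
}

The wrapper keeps the raw buffer together with its size, the number of stacked vectors and their layout, which is what lets call sites such as vcycle_PRECISION above take vector_PRECISION* arguments instead of bare buffers.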