diff --git a/.gitignore b/.gitignore index bf1f8a9..097b117 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,3 @@ -* !.gitignore diff --git a/CREDITS b/CREDITS index ddbc72c..77cf852 100644 --- a/CREDITS +++ b/CREDITS @@ -1,6 +1,6 @@ This software is an outcome of the PhD thesis of Matthias Rottmann, University of Wuppertal. -Code Designers: Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder. +Code Designers: Matthias Rottmann, Simone Bacchio, Artur Strebel, Simon Heybrock, Bjoern Leder. Contributions by: Andreas Frommer, Karsten Kahl, Stefan Krieg, Kalman Szabo, Wolfgang Soeldner, Holger Arndt, Peter Georg. diff --git a/Makefile b/Makefile index 6abdff9..ae45ecc 100644 --- a/Makefile +++ b/Makefile @@ -2,8 +2,8 @@ CC = mpiicc # --- CFLAGS ----------------------------------------- -CFLAGS_gnu = -std=gnu99 -Wall -pedantic -fopenmp -O3 -ffast-math -msse4.2 -CFLAGS_intel = -std=gnu99 -Wall -pedantic -qopenmp -O3 -xHOST +CFLAGS_gnu = -std=gnu99 -Wall -pedantic -O3 -ffast-math -msse4.2 -fopenmp +CFLAGS_intel = -std=gnu99 -Wall -pedantic -O3 -xHOST -qopenmp CFLAGS = $(CFLAGS_intel) # --- DO NOT CHANGE ----------------------------------- @@ -45,8 +45,8 @@ LIMELIB= -L$(LIMEDIR)/lib -llime # -DPARAMOUTPUT -DTRACK_RES -DFGMRES_RESTEST -DPROFILING # -DSINGLE_ALLREDUCE_ARNOLDI # -DCOARSE_RES -DSCHWARZ_RES -DTESTVECTOR_ANALYSIS -DDEBUG -OPT_VERSION_FLAGS = $(CFLAGS) $(LIMEFLAGS) $(H5FLAGS) -DOPENMP -DSSE -DPARAMOUTPUT -DTRACK_RES -DEVEL_VERSION_FLAGS = $(CFLAGS) $(LIMEFLAGS) -DOPENMP -DSSE -DDEBUG -DPARAMOUTPUT -DTRACK_RES -DFGMRES_RESTEST -DPROFILING -DCOARSE_RES -DSCHWARZ_RES -DTESTVECTOR_ANALYSIS +OPT_VERSION_FLAGS = $(CFLAGS) $(LIMEFLAGS) $(H5FLAGS) -DPARAMOUTPUT -DTRACK_RES -DSSE -DOPENMP +DEVEL_VERSION_FLAGS = $(CFLAGS) $(LIMEFLAGS) -DDEBUG -DPARAMOUTPUT -DTRACK_RES -DFGMRES_RESTEST -DPROFILING -DCOARSE_RES -DSCHWARZ_RES -DTESTVECTOR_ANALYSIS -DSSE -DOPENMP all: execs library exec-tests @@ -92,10 +92,10 @@ $(INCDIR)/%: $(SRCDIR)/% cp $(SRCDIR)/`basename $@` $@ $(BUILDDIR)/%.o: $(GSRCDIR)/%.c $(SRCDIR)/*.h - $(CC) $(CFLAGS) $(OPT_VERSION_FLAGS) -c $< -o $@ + $(CC) $(OPT_VERSION_FLAGS) -c $< -o $@ $(BUILDDIR)/%_devel.o: $(GSRCDIR)/%.c $(SRCDIR)/*.h - $(CC) -g $(CFLAGS) $(DEVEL_VERSION_FLAGS) -c $< -o $@ + $(CC) -g $(DEVEL_VERSION_FLAGS) -c $< -o $@ $(GSRCDIR)/%.h: $(SRCDIR)/%.h $(firstword $(MAKEFILE_LIST)) cp $< $@ diff --git a/NEWS b/NEWS index 5efb65f..6fe3f55 100644 --- a/NEWS +++ b/NEWS @@ -1,9 +1,25 @@ -Version v1606-sbacchio/master: - -Twisted mass fermions operator for Nf=2 - -Support of different shift on even and odd sites + + +Version v1610 - TM Nf=2+1+1: + + - Twisted mass fermions operator for Nf=1+1 + - Support of different shifts on even and odd sites -Personalized version of the library + - The following parameters have been added to the input file: + -- //TODO + + +Version v1606 - TM Nf=2: + + - Twisted mass fermions operator for Nf=2 + - A different TM shift can be applied on even and odd sites + - Personalized version of the library + - The following parameters have been added to the input file: + -- //TODO + Version v1606 - first release: + This is the first release of the DDalphaAMG solver library. For an overview of its features included, please consult the user documentation in /doc.
\ No newline at end of file diff --git a/README b/README index 94e2604..a4d9e8e 100644 --- a/README +++ b/README @@ -22,6 +22,6 @@ INSTALL: HOWTO: - After having compiled the the user documentation via + After having compiled the user documentation via "make documentation" please consult the compiled PDF in /doc for further information. diff --git a/sample.ini b/sample.ini index a8c64e6..3729bed 100644 --- a/sample.ini +++ b/sample.ini @@ -33,48 +33,52 @@ right hand side: 2 | 2 - twisted boundary cond. | | (M_PI,M_PI,M_PI,M_PI)*t.b.c. | |----------------------------------------------| -boundary conditions: 2 +boundary conditions: 1 twisted boundary conditions: 1 0 0 0 number of levels: 3 -number of openmp threads: 2 +number of openmp threads: 1 |--- depth 0 ----------------------------------| d0 global lattice: 8 8 8 8 // global lattice size -d0 local lattice: 4 8 8 8 // lattice size on each process - +d0 local lattice: 8 8 4 4 // lattice size on each process // nproc = prod(global lattice)/prod(local lattice) -d0 block lattice: 2 2 2 2 // Schwarz blocks +d0 block lattice: 4 4 4 4 // Schwarz blocks + d0 block lattice: 2 2 2 2 // Schwarz blocks d0 post smooth iter: 2 // number of Schwarz cycles per MG iteration d0 block iter: 4 d0 test vectors: 24 // number of test vectors used to construct the interpolation -d0 setup iter: 4 // number of bootstrap setup iteration (excluding the initial step) +d0 setup iter: 3 // number of bootstrap setup iteration (excluding the initial step) -d2 mu factor: 4.0 -d3 mu factor: 8.0 +d1 mu factor: 5.0 + d2 mu factor: 5.0 #wilson_param // parameters for the inverter -#the following OR kappa: ... -m0: -0.5 -csw: 1.0 -#the following OR 2KappaMu: ... -mu: 0.5 -mu odd shift: 0.0 -mu even shift: 0.0 +m0: -0.4 +csw: 1.6 +mu: 0.005 +setup mu: 0.001 + mu odd shift: 0.0 + mu even shift: 0.0 +epsbar: 0.11 + epsbar odd shift: 0.0 + epsbar even shift: 0.0 addDownPropagator: 1 tolerance for relative residual: 1E-10 -iterations between restarts: 50 // should be increased for ill-conditioned cnfgs -maximum of restarts: 20 // should be increased for ill-conditioned cnfgs -coarse grid tolerance: 5E-2 -coarse grid iterations: 100 // should be increased for ill-conditioned cnfgs -coarse grid restarts: 5 // should be increased for ill-conditioned cnfgs +iterations between restarts: 20 +maximum of restarts: 50 +coarse grid tolerance: 1E-2 +coarse grid iterations: 25 +coarse grid restarts: 20 #general_param print mode: 1 method: 2 -mixed precision: 1 +interpolation: 2 +mixed precision: 2 randomize test vectors: 0 // initialize random number generator with time(0) ? 0=no/1=yes +odd even preconditioning: 1 // for further information, please read the user documentation in doc/ // developers version of an input file in sample_devel.ini diff --git a/src/DDalphaAMG.h b/src/DDalphaAMG.h index 5f50b32..a9556c3 100644 --- a/src/DDalphaAMG.h +++ b/src/DDalphaAMG.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder. + * Copyright (C) 2016, Simone Bacchio. * * This file is part of the DDalphaAMG solver library. 
* @@ -145,8 +145,17 @@ ** mg_status.success = 0: not converged, 1: converged ** mg_status.info = final residual **/ - void DDalphaAMG_solve( double *vector_out, double *vector_in, double tol, - DDalphaAMG_status *mg_status ); + void DDalphaAMG_solve( double *vector_out, double *vector_in, + double tol, DDalphaAMG_status *mg_status ); + + void DDalphaAMG_solve_doublet( double *vector1_out, double *vector1_in, + double *vector2_out, double *vector2_in, + double tol, DDalphaAMG_status *mg_status ); + + void DDalphaAMG_solve_ms_doublet( double **vector1_out, double *vector1_in, + double **vector2_out, double *vector2_in, + double *even_shifts, double *odd_shifts, int n_shifts, + double *tol, DDalphaAMG_status *mg_status ); /** ** Optional - Solve squared operator performing two inversions: @@ -154,8 +163,17 @@ ** mg_status.success = 0: not converged, 1: converged ** mg_status.info = final residual **/ - void DDalphaAMG_solve_squared( double *vector_out, double *vector_in, double tol, - DDalphaAMG_status *mg_status ); + void DDalphaAMG_solve_squared( double *vector_out, double *vector_in, + double tol, DDalphaAMG_status *mg_status ); + + void DDalphaAMG_solve_doublet_squared( double *vector1_out, double *vector1_in, + double *vector2_out, double *vector2_in, + double tol, DDalphaAMG_status *mg_status ); + + void DDalphaAMG_solve_ms_doublet_squared( double **vector1_out, double *vector1_in, + double **vector2_out, double *vector2_in, + double *even_shifts, double *odd_shifts, int n_shifts, + double *tol, DDalphaAMG_status *mg_status ); /** ** Optional - Solve squared operator against the odd compoments performing two inversions: @@ -163,8 +181,17 @@ ** mg_status.success = 0: not converged, 1: converged ** mg_status.info = final residual **/ - void DDalphaAMG_solve_squared_odd( double *vector_out, double *vector_in, double tol, - DDalphaAMG_status *mg_status ); + void DDalphaAMG_solve_squared_odd( double *vector_out, double *vector_in, + double tol, DDalphaAMG_status *mg_status ); + + void DDalphaAMG_solve_doublet_squared_odd( double *vector1_out, double *vector1_in, + double *vector2_out, double *vector2_in, + double tol, DDalphaAMG_status *mg_status ); + + void DDalphaAMG_solve_ms_doublet_squared_odd( double **vector1_out, double *vector1_in, + double **vector2_out, double *vector2_in, + double *even_shifts, double *odd_shifts, int n_shifts, + double *tol, DDalphaAMG_status *mg_status ); /** ** Optional - Solve squared operator against the even compoments performing two inversions: @@ -172,8 +199,17 @@ ** mg_status.success = 0: not converged, 1: converged ** mg_status.info = final residual **/ - void DDalphaAMG_solve_squared_even( double *vector_out, double *vector_in, double tol, - DDalphaAMG_status *mg_status ); + void DDalphaAMG_solve_squared_even( double *vector_out, double *vector_in, + double tol, DDalphaAMG_status *mg_status ); + + void DDalphaAMG_solve_doublet_squared_even( double *vector1_out, double *vector1_in, + double *vector2_out, double *vector2_in, + double tol, DDalphaAMG_status *mg_status ); + + void DDalphaAMG_solve_ms_doublet_squared_even( double **vector1_out, double *vector1_in, + double **vector2_out, double *vector2_in, + double *even_shifts, double *odd_shifts, int n_shifts, + double *tol, DDalphaAMG_status *mg_status ); /** ** Optional - Apply the operator: @@ -181,7 +217,10 @@ ** mg_status.success = 1 **/ void DDalphaAMG_apply_operator( double *vector_out, double *vector_in, - DDalphaAMG_status *mg_status ); + DDalphaAMG_status *mg_status ); + + void 
DDalphaAMG_apply_operator_doublet( double *vector1_out, double *vector1_in, + double *vector2_out, double *vector2_in, DDalphaAMG_status *mg_status ); /** ** Optional - Apply a preconditioner step: @@ -190,7 +229,9 @@ ** mg_status.info = residual after preconditioning **/ void DDalphaAMG_preconditioner( double *vector_out, double *vector_in, - DDalphaAMG_status *mg_status ); + DDalphaAMG_status *mg_status ); + void DDalphaAMG_preconditioner_doublet( double *vector1_out, double *vector1_in, + double *vector2_out, double *vector2_in, DDalphaAMG_status *mg_status ); /* * Concluding the following functions have to be call for freeing the memory and finalizing @@ -228,11 +269,11 @@ ** -> mg_params.conf_index_fct = NULL, mg_params.vector_index_fct = NULL; **/ void DDalphaAMG_read_configuration( double *gauge_field, char *filename, int format, - DDalphaAMG_status *mg_status ); + DDalphaAMG_status *mg_status ); void DDalphaAMG_read_vector( double *vector_in, char *filename, int format, - DDalphaAMG_status *mg_status ); + DDalphaAMG_status *mg_status ); void DDalphaAMG_write_vector( double *vector_out, char *filename, int format, - DDalphaAMG_status *mg_status ); + DDalphaAMG_status *mg_status ); /** ** Extra - Define vector with constant or random components @@ -414,23 +455,39 @@ ** Hopping parameter **/ double kappa; - + /** ** Twisted mass parameter and shifts on even/odd sites **/ double mu; double mu_odd_shift; double mu_even_shift; - + /** - ** Twisted mass factor for the preconditioner on each level. + ** Twisted mass factor for the preconditioner on each level, l. ** Default 6 on the coarsest level ** - ** -> mu_o[l] = (mu + mu_odd_shift) * mu_factor - ** -> mu_e[l] = (mu + mu_even_shift) * mu_factor + ** -> mu_o[l] = (mu + mu_odd_shift) * mu_factor[l] + ** -> mu_e[l] = (mu + mu_even_shift) * mu_factor[l] **/ double mu_factor[MAX_MG_LEVELS]; + /** + ** Twisted mass doublet parameter and shifts on even/odd sites + **/ + double epsbar; + double epsbar_ig5_odd_shift; + double epsbar_ig5_even_shift; + + /** + ** Twisted mass doublet factor for the preconditioner on each level, l. + ** Default 6 on the coarsest level + ** + ** -> epsbar_o[l] = ( epsbar + i * gamma_5 * epsbar_ig5_odd_shift ) * epsbar_factor[l] + ** -> epsbar_e[l] = ( epsbar + i * gamma_5 * epsbar_ig5_even_shift ) * epsbar_factor[l] + **/ + double epsbar_factor[MAX_MG_LEVELS]; + /** ** Function returning the index of a element at the corresponding ** position (t,z,y,x are local position w.r.t the process ). diff --git a/src/DDalphaAMG_interface.c b/src/DDalphaAMG_interface.c index 03680d4..65f80a2 100644 --- a/src/DDalphaAMG_interface.c +++ b/src/DDalphaAMG_interface.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder. + * Copyright (C) 2016, Simone Bacchio. * * This file is part of the DDalphaAMG solver library. 
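[Editor's note on the new parameter fields added to DDalphaAMG.h above: a minimal usage sketch of how a caller might set the twisted-mass and doublet terms and their per-level factors. It assumes the DDalphaAMG_get_parameters / DDalphaAMG_update_parameters pair used later in this patch and the field names introduced above; the numerical values and the choice of level index are illustrative only.]

  #include "DDalphaAMG.h"

  /* Sketch: enable the twisted-mass and non-degenerate doublet terms and
   * rescale the preconditioner shift on the coarsest of three levels.
   * Values are placeholders, not recommendations.                         */
  static void tune_tm_doublet( void )
  {
    DDalphaAMG_parameters params;
    DDalphaAMG_status status;

    DDalphaAMG_get_parameters( &params );

    params.mu            = 0.005;  /* mu_e/o[l] = (mu + shift) * mu_factor[l] */
    params.mu_even_shift = 0.0;
    params.mu_odd_shift  = 0.0;
    params.mu_factor[2]  = 6.0;    /* default 6 on the coarsest level         */

    params.epsbar                = 0.11;  /* doublet term                     */
    params.epsbar_ig5_even_shift = 0.0;
    params.epsbar_ig5_odd_shift  = 0.0;
    params.epsbar_factor[2]      = 6.0;

    DDalphaAMG_update_parameters( &params, &status );
    /* status.success == 1: parameters updated, == 2: a new setup was run     */
  }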
* @@ -101,7 +101,7 @@ void DDalphaAMG_initialize( DDalphaAMG_init *mg_init, DDalphaAMG_parameters *mg_ threading[i] = NULL; MALLOC( threading[i], struct Thread, 1); } -#pragma omp parallel num_threads(g.num_openmp_processes) + THREADED(g.num_openmp_processes) setup_threading(threading[omp_get_thread_num()], commonthreaddata, &l); g.conf_flag = 0; @@ -128,22 +128,26 @@ void DDalphaAMG_update_parameters( DDalphaAMG_parameters *mg_params, DDalphaAMG_ // int method; if ( mg_params->method != g.method ) { + g.method = mg_params->method; if( g.setup_flag ) { //TODO: test which cases work and what to do for making the other working warning0("Change of method parameter after setup not guaranteed\n"); } - g.method = mg_params->method; } // int interpolation; if ( g.interpolation != mg_params->interpolation ) { - //TODO: test if it always works g.interpolation = mg_params->interpolation; + if( g.setup_flag ) { + //TODO: test which cases work and what to do for making the other working + warning0("Change of interpolation parameter after setup not guaranteed\n"); + } } // int mixed_precision; if ( mg_params->mixed_precision != g.mixed_precision ) { -#ifndef INIT_ONE_PREC + g.mixed_precision = mg_params->mixed_precision; +#ifndef INIT_ONE_PREC //change between 1 and 2 allowed if( g.setup_flag && mg_params->mixed_precision * g.mixed_precision == 0 ) { warning0("Change from mixed_precision==0 to !=0 (or viceversa) needs a new setup.\n"); re_setup++; @@ -152,45 +156,48 @@ void DDalphaAMG_update_parameters( DDalphaAMG_parameters *mg_params, DDalphaAMG_ warning0("Change of mixed_precision needs a new setup.\n"); re_setup++; #endif - g.mixed_precision = mg_params->mixed_precision; } // int block_lattice[MAX_MG_LEVELS][4]; for ( i=0; iblock_lattice[i][j]) { - if (g.setup_flag) - re_setup++; - g.block_lattice[i][j] = mg_params->block_lattice[i][j]; - // TODO: add check + g.block_lattice[i][j] = mg_params->block_lattice[i][j]; + parameter_update(&l); + if (g.setup_flag) { + warning0("Change of block_lattice needs a new setup.\n"); + re_setup++; + } } - + // int mg_basis_vectors[MAX_MG_LEVELS-1]; l_tmp=&l; for ( i=0; img_basis_vectors[i] != g.num_eig_vect[i] ) { + g.num_eig_vect[i] = mg_params->mg_basis_vectors[i]; + if( i==0 ) + parameter_update(&l); if( g.setup_flag ) { - if ( mg_params->mg_basis_vectors[i] < g.num_eig_vect[i] ) - re_projs++; //TODO: check if it works - else - re_setup++; + if ( mg_params->mg_basis_vectors[i] < g.num_eig_vect[i] ) + re_projs++; //TODO: check if this works + else { //TODO just compute the extra vectors + warning0("Increasing mg_basis_vectors needs a new setup.\n"); + re_setup++; + } } - g.num_eig_vect[i] = mg_params->mg_basis_vectors[i]; - if( g.setup_flag || i==0 ) - l_tmp->num_eig_vect = mg_params->mg_basis_vectors[i]; } if( g.setup_flag ) l_tmp = l_tmp->next_level; } - + // int setup_iterations[MAX_MG_LEVELS]; l_tmp=&l; for ( i=0; isetup_iterations[i] != g.setup_iter[i] ) { g.setup_iter[i] = mg_params->setup_iterations[i]; if( (g.setup_flag && i>0) || (!g.setup_flag && i==0) ) - //after setup, l.setup_iter[i] is used as a counter for total number of setup iters - l_tmp->setup_iter = mg_params->setup_iterations[i]; + //after setup, l.setup_iter[i] is used as a counter for total number of setup iters + l_tmp->setup_iter = mg_params->setup_iterations[i]; } if( g.setup_flag ) l_tmp = l_tmp->next_level; @@ -212,22 +219,22 @@ void DDalphaAMG_update_parameters( DDalphaAMG_parameters *mg_params, DDalphaAMG_ if (l_tmp->level > 0) { // double kcycle_tolerance; if ( 
mg_params->kcycle_tolerance != g.kcycle_tol ) { - g.kcycle_tol = mg_params->kcycle_tolerance; - if( g.setup_flag || i==0 ) { - if ( g.mixed_precision ) - l_tmp->p_float.tol = g.kcycle_tol; - else - l_tmp->p_float.tol = g.kcycle_tol; - } + g.kcycle_tol = mg_params->kcycle_tolerance; + if( g.setup_flag || i==0 ) { + if ( g.mixed_precision ) + l_tmp->p_float.tol = g.kcycle_tol; + else + l_tmp->p_float.tol = g.kcycle_tol; + } } } else { // double coarse_tolerance; if ( mg_params->coarse_tolerance != g.coarse_tol ){ - g.coarse_tol = mg_params->coarse_tolerance; - if (g.setup_flag && g.mixed_precision ) - l_tmp->p_float.tol = g.coarse_tol; - else if(g.setup_flag) - l_tmp->p_float.tol = g.coarse_tol; + g.coarse_tol = mg_params->coarse_tolerance; + if (g.setup_flag && g.mixed_precision ) + l_tmp->p_float.tol = g.coarse_tol; + else if(g.setup_flag) + l_tmp->p_float.tol = g.coarse_tol; } } @@ -236,32 +243,74 @@ void DDalphaAMG_update_parameters( DDalphaAMG_parameters *mg_params, DDalphaAMG_ else break; } - + // double kappa; m0 = 1./(2.*mg_params->kappa)-4.; - if( creal(l.dirac_shift)!= m0 ){ + if( g.m0 != m0 ){ + g.m0 = m0; + THREADED(threading[0]->n_core) + if ( g.setup_flag ) + m0_update( g.m0, &l, threading[omp_get_thread_num()] ); + else if ( g.conf_flag ) + m0_update_double( g.m0, &(g.op_double), &l, threading[omp_get_thread_num()] ); re_dirac++; } - + // double mu; // double mu_odd_shift; // double mu_even_shift; // double mu_factor[MAX_MG_LEVELS]; #ifdef HAVE_TM - if( mg_params->mu != g.tm_mu || mg_params->mu_odd_shift != g.tm_mu_odd_shift || - mg_params->mu_even_shift != g.tm_mu_even_shift){ - g.setup_tm_mu = mg_params->mu; - g.tm_mu = mg_params->mu; - g.tm_mu_even_shift = mg_params->mu_even_shift; - g.tm_mu_odd_shift = mg_params->mu_odd_shift; + int update_mu = 0; + for ( i=0; imu_factor[i] != g.mu_factor[i] ) { + g.mu_factor[i] = mg_params->mu_factor[i]; + update_mu = 1; + } + + if( update_mu || mg_params->mu != g.mu || mg_params->mu_odd_shift != g.mu_odd_shift || + mg_params->mu_even_shift != g.mu_even_shift ){ + g.setup_mu = mg_params->mu; + g.mu = mg_params->mu; + g.mu_even_shift = mg_params->mu_even_shift; + g.mu_odd_shift = mg_params->mu_odd_shift; + THREADED(threading[0]->n_core) + if ( g.setup_flag ) + tm_term_update( g.mu, &l, threading[omp_get_thread_num()] ); + else if ( g.conf_flag ) + tm_term_double_setup( g.mu, g.mu_even_shift, g.mu_odd_shift, &(g.op_double), &l, threading[omp_get_thread_num()] ); re_dirac++; } +#else + if ( mg_params->mu != 0 || mg_params->mu_odd_shift != 0 || mg_params->mu_even_shift != 0 ) + warning0("Parameters mu, mu_odd_shift, mu_even_shift not supported without defining HAVE_TM flag."); +#endif + +#ifdef HAVE_TM1p1 + int update_eps = 0; + for ( i=0; imu_factor[i] != g.tm_mu_factor[i] ) { - g.tm_mu_factor[i] = mg_params->mu_factor[i]; - re_dirac++; - } + if (mg_params->epsbar_factor[i] != g.epsbar_factor[i] ) { + g.epsbar_factor[i] = mg_params->epsbar_factor[i]; + update_eps = 1; + } + + if( update_eps || mg_params->epsbar != g.epsbar || mg_params->epsbar_ig5_odd_shift != g.epsbar_ig5_odd_shift || mg_params->epsbar_ig5_even_shift != g.epsbar_ig5_even_shift ){ + g.epsbar = mg_params->epsbar; + g.epsbar_ig5_even_shift = mg_params->epsbar_ig5_even_shift; + g.epsbar_ig5_odd_shift = mg_params->epsbar_ig5_odd_shift; + THREADED(threading[0]->n_core) + if ( g.setup_flag ) + epsbar_term_update( &l, threading[omp_get_thread_num()] ); + else if ( g.conf_flag ) + epsbar_term_double_setup( g.epsbar, g.epsbar_ig5_even_shift, g.epsbar_ig5_odd_shift, &(g.op_double), 
&l, threading[omp_get_thread_num()] ); + re_dirac++; + } + +#else + if ( mg_params->epsbar != 0 || mg_params->epsbar_ig5_odd_shift != 0 || mg_params->epsbar_ig5_even_shift != 0 ) + warning0("Parameters epsbar, epsbar_odd_shift, epsbar_even_shift not supported without defining HAVE_TM1p1 flag."); #endif // int (*conf_index_fct)(int t, int z, int y, int x, int mu); @@ -271,78 +320,27 @@ void DDalphaAMG_update_parameters( DDalphaAMG_parameters *mg_params, DDalphaAMG_ // int print; g.print = mg_params->print; - + // UPDATING - if ( re_setup && g.setup_flag ){ - if ( re_dirac ) { - if( creal(l.dirac_shift)!= m0 ) -#pragma omp parallel num_threads(threading[0]->n_core) - shift_update_double( &(g.op_double), m0, &l, threading[omp_get_thread_num()] ); -#ifdef HAVE_TM - l.tm_shift = g.tm_mu; - l.tm_even_shift = g.tm_mu_even_shift; - l.tm_odd_shift = g.tm_mu_odd_shift; -#pragma omp parallel num_threads(threading[0]->n_core) - tm_term_double_setup( g.op_double.tm_term, g.op_double.odd_proj, &l, threading[omp_get_thread_num()]); -#endif - } - l.dirac_shift = m0; + if ( re_setup && g.setup_flag ){ // destroy and repeate setup DDalphaAMG_setup( mg_status ); // TODO handle status - } else if ( re_projs && g.setup_flag ) { - if ( re_dirac ) { -#pragma omp parallel num_threads(threading[0]->n_core) - if( creal(l.dirac_shift)!= m0 ) { - shift_update_double( &(g.op_double), m0, &l, threading[omp_get_thread_num()] ); - shift_update_float( &(g.op_float), m0, &l, threading[omp_get_thread_num()] ); - if(l.s_double.op.clover != NULL) - shift_update_double( &(l.s_double.op), m0, &l, threading[omp_get_thread_num()] ); - if ( l.s_float.op.clover != NULL ) - shift_update_float( &(l.s_float.op), m0, &l, threading[omp_get_thread_num()] ); - } -#ifdef HAVE_TM - l.tm_shift = g.tm_mu; - l.tm_even_shift = g.tm_mu_even_shift; - l.tm_odd_shift = g.tm_mu_odd_shift; -#pragma omp parallel num_threads(threading[0]->n_core) - { - tm_term_double_setup( g.op_double.tm_term, g.op_double.odd_proj, &l, threading[omp_get_thread_num()]); - tm_term_float_setup( g.op_float.tm_term, g.op_float.odd_proj, &l, threading[omp_get_thread_num()] ); - if(l.s_double.op.tm_term != NULL) - tm_term_double_setup( l.s_double.op.tm_term, l.s_double.op.odd_proj, &l, threading[omp_get_thread_num()] ); - if ( l.s_float.op.tm_term != NULL ) - tm_term_float_setup( l.s_float.op.tm_term, l.s_float.op.odd_proj, &l, threading[omp_get_thread_num()] ); - } -#endif - } - l.dirac_shift = m0; + } else if ( re_projs && g.setup_flag ) { //project again the operators if ( g.mixed_precision ) -#pragma omp parallel num_threads(threading[0]->n_core) - re_setup_float( &l, threading[omp_get_thread_num()] ); + THREADED(threading[0]->n_core) + re_setup_float( &l, threading[omp_get_thread_num()] ); else -#pragma omp parallel num_threads(threading[0]->n_core) - re_setup_double( &l, threading[omp_get_thread_num()] ); - - } else if ( (re_dirac && g.conf_flag) || re_projs || re_setup ) { - if (g.setup_flag ) -#pragma omp parallel num_threads(threading[0]->n_core) - optimized_shift_update( m0, &l, threading[omp_get_thread_num()]); - else { - if( creal(l.dirac_shift)!= m0 ) -#pragma omp parallel num_threads(threading[0]->n_core) - shift_update_double( &(g.op_double), m0, &l, threading[omp_get_thread_num()] ); -#ifdef HAVE_TM - l.tm_shift = g.tm_mu; - l.tm_even_shift = g.tm_mu_even_shift; - l.tm_odd_shift = g.tm_mu_odd_shift; -#pragma omp parallel num_threads(threading[0]->n_core) - tm_term_double_setup( g.op_double.tm_term, g.op_double.odd_proj, &l, threading[omp_get_thread_num()]); 
-#endif - } + THREADED(threading[0]->n_core) + re_setup_double( &l, threading[omp_get_thread_num()] ); + + } else if ( re_dirac && g.setup_flag ) { //update just the oddeven and vecorized operators + THREADED(threading[0]->n_core) + finalize_operator_update( &l, threading[omp_get_thread_num()]); } + DDalphaAMG_get_parameters( mg_params ); - + t1 = MPI_Wtime(); mg_status->success = 1+re_setup;// 1: OK, 2: re_setup done @@ -353,7 +351,7 @@ void DDalphaAMG_update_parameters( DDalphaAMG_parameters *mg_params, DDalphaAMG_ } void DDalphaAMG_change_mu_sign( DDalphaAMG_status *mg_status ) { - + double t0, t1; t0 = MPI_Wtime(); g.coarse_time = 0; @@ -362,33 +360,35 @@ void DDalphaAMG_change_mu_sign( DDalphaAMG_status *mg_status ) { mg_status->success = 0; mg_status->info = 0; - g.tm_mu *= -1; - g.tm_mu_even_shift *= -1; - g.tm_mu_odd_shift *= -1; +#ifdef HAVE_TM + g.mu *= -1; + g.mu_even_shift *= -1; + g.mu_odd_shift *= -1; if (g.conf_flag && !g.setup_flag ) { - l.tm_shift = g.tm_mu; - l.tm_even_shift = g.tm_mu_even_shift; - l.tm_odd_shift = g.tm_mu_odd_shift; - -#pragma omp parallel num_threads(threading[0]->n_core) - tm_term_double_setup( g.op_double.tm_term, g.op_double.odd_proj, &l, threading[omp_get_thread_num()]); + THREADED(threading[0]->n_core) + tm_term_double_setup( g.mu, g.mu_even_shift, g.mu_odd_shift, &(g.op_double), &l, threading[omp_get_thread_num()]); } else if (g.conf_flag && g.setup_flag ) -#pragma omp parallel num_threads(threading[0]->n_core) - optimized_shift_update( l.dirac_shift, &l, threading[omp_get_thread_num()]); - + THREADED(threading[0]->n_core) { + tm_term_update( g.mu, &l, threading[omp_get_thread_num()] ); + finalize_operator_update( &l, threading[omp_get_thread_num()] ); + } + mg_status->info = g.mu; +#else + warning0("DDalphaAMG_change_mu_sign called without the flag HAVE_TM enabled. 
Doing nothing.\n"); + mg_status->info = 0; +#endif + t1 = MPI_Wtime(); mg_status->success = 1;// 1: OK, 2: re_setup done mg_status->time = t1-t0; - mg_status->info = g.tm_mu; mg_status->coarse_time = g.coarse_time; } - void DDalphaAMG_set_configuration( double *gauge_field, DDalphaAMG_status *mg_status ) { int t, z, y, x, mu, i, j, k; @@ -407,28 +407,28 @@ void DDalphaAMG_set_configuration( double *gauge_field, DDalphaAMG_status *mg_st if ( g.print > 0 ) printf0("%s\n", CLIFFORD_BASIS ); if ( g.bc == _ANTIPERIODIC ) printf0("antiperiodic in time"); else if ( g.bc == _TWISTED ) printf0("twisted (%.2f, %.2f, %.2f, %.2f)", g.twisted_bc[0], - g.twisted_bc[1], g.twisted_bc[2], g.twisted_bc[3]); + g.twisted_bc[1], g.twisted_bc[2], g.twisted_bc[3]); else printf0("periodic in time"); printf0(" boundary conditions \n"); SU3_storage_alloc( &U, &l ); - + if(g.bc == _ANTIPERIODIC && onb[T] ) { phase[Z] = 1; phase[Y] = 1; phase[X] = 1; for ( t=1, i=0, k=0; t 0 ) printf0("Configuration stored...\n"); - + compute_clover_term( U, &l ); // calculate the plaquette @@ -482,7 +482,7 @@ void DDalphaAMG_set_configuration( double *gauge_field, DDalphaAMG_status *mg_st SU3_storage_free( &U, &l ); //END: dirac_setup - + mg_status->success = 1; g.conf_flag = 1; mg_status->info = g.plaq; @@ -492,7 +492,7 @@ void DDalphaAMG_set_configuration( double *gauge_field, DDalphaAMG_status *mg_st schwarz_double_setup( &(l.s_double), &(g.op_double), &l ); if(l.s_float.op.clover != NULL) schwarz_float_setup( &(l.s_float), &(g.op_double), &l ); -#pragma omp parallel num_threads(threading[0]->n_core) + THREADED(threading[0]->n_core) if ( g.mixed_precision ) operator_updates_float( &l, threading[omp_get_thread_num()] ); else @@ -522,7 +522,7 @@ void DDalphaAMG_setup( DDalphaAMG_status * mg_status ) { if(g.conf_flag == 1) { if ( g.setup_flag ) method_free( &l ); -#pragma omp parallel num_threads(threading[0]->n_core) + THREADED(threading[0]->n_core) { method_setup( NULL, &l, threading[omp_get_thread_num()] ); method_update( g.setup_iter[0], &l, threading[omp_get_thread_num()] ); @@ -552,7 +552,7 @@ void DDalphaAMG_update_setup( int iterations, DDalphaAMG_status * mg_status ) { mg_status->success = 0; mg_status->info = 0; -#pragma omp parallel num_threads(threading[0]->n_core) + THREADED(threading[0]->n_core) method_update( iterations, &l, threading[omp_get_thread_num()] ); // method_update( iterations, &l, no_threading ); @@ -571,13 +571,101 @@ void DDalphaAMG_update_setup( int iterations, DDalphaAMG_status * mg_status ) { } } +static inline void vector_copy( vector_double vector_out, vector_double vector_in ) +{ + THREADED(threading[0]->n_core) { + int start = threading[omp_get_thread_num()]->start_index[0], + end = threading[omp_get_thread_num()]->end_index[0]; + vector_double_copy( vector_out, vector_in, start, end, &l ); + } +} + +static inline void solver( ) +{ + THREADED(threading[0]->n_core) + if ( g.method == -1 ) { + cgn_double( &(g.p), &l, threading[omp_get_thread_num()] ); + } else if ( g.mixed_precision == 2 ) { + fgmres_MP( &(g.p_MP), &l, threading[omp_get_thread_num()] ); + } else { + fgmres_double( &(g.p), &l, threading[omp_get_thread_num()] ); + } +} + +static inline void correct_guess( vector_double guess, vector_double solution, vector_double solution2, + double even_dshift, double odd_dshift ) +{ + // guess = D^{-1}*rhs - i*dshift*D^{-2}*rhs + THREADED(threading[0]->n_core) { + int start = threading[omp_get_thread_num()]->start_index[0], + end = threading[omp_get_thread_num()]->end_index[0]; + if( odd_dshift == 0 
|| even_dshift == 0 || even_dshift == odd_dshift ) { + double dshift = ( odd_dshift == 0 ) ? even_dshift:odd_dshift; + printf0("correcting with dshift %le\n", dshift); + vector_double_scale( guess, solution2, -I*dshift, 0, l.inner_vector_size, &l, threading[omp_get_thread_num()]); + vector_double_plus( guess, guess, solution, start, end, &l ); + } else + vector_double_copy( guess, solution, start, end, &l ); + } +} + +static inline void change_epsbar_shift_sign( ) { + +#ifdef HAVE_TM1p1 + if ( g.epsbar_ig5_even_shift !=0 || g.epsbar_ig5_odd_shift !=0 ) { + g.epsbar_ig5_even_shift *= -1; + g.epsbar_ig5_odd_shift *= -1; + + if (g.conf_flag && !g.setup_flag ) { + + THREADED(threading[0]->n_core) { + epsbar_term_double_setup( g.epsbar, g.epsbar_ig5_even_shift, g.epsbar_ig5_odd_shift, &(g.op_double), + &l, threading[omp_get_thread_num()]); + } + } else if (g.conf_flag && g.setup_flag ) + THREADED(threading[0]->n_core) { + epsbar_term_update( &l, threading[omp_get_thread_num()] ); + finalize_operator_update( &l, threading[omp_get_thread_num()] ); + } + } +#else + warning0("change_epsbar_shift_sign called without the flag HAVE_TM1p1 enabled. Doing nothing.\n"); +#endif +} enum {_SOLVE, _SOLVE_SQ, _SOLVE_SQ_ODD, _SOLVE_SQ_EVEN, _PRECOND, _OPERATOR}; -void DDalphaAMG_driver( double *vector_out, double *vector_in, DDalphaAMG_status *mg_status, int _TYPE ) { + +// NOTE RESIDUAL +// +// The _SOLVE_SQ invert the squared operator in two inversion. +// One has to be careful to return a solution with the right residual. +// +// We have: +// D^2 = Dd D +// +// Dd D x = b direct solution +// Dd y = b first step +// D x = y second step +// +// r1 = Dd y - b +// r2 = D x - y +// r = Dd D x - b = Dd r2 + r1 +// +// |r| < tol --> |r| < |Dd| |r2| + |r1| < tol +// +// For the residual we do +// |r1| < tol/2 +// |r2| < (tol - |r1|)/|Dd| using |Dd|=8 since is |Dd|<8 +// +// With relative residual we have +// |r1|/|b| < tol/2 +// |r2|/|y| < (tol - |r1|)/|Dd|*|b|/|y| + +static inline void DDalphaAMG_driver( double *vector1_out, double *vector1_in, double *vector2_out, double *vector2_in, double tol, DDalphaAMG_status *mg_status, int _TYPE ) { int t, z, y, x, i, j, k, mu, *ll = l.local_lattice, *gl=l.global_lattice, sl[4], precision_changed; - complex_double twisted_bc, tmp; - double phase[4] = {_COMPLEX_double_ZERO, _COMPLEX_double_ZERO, _COMPLEX_double_ZERO, _COMPLEX_double_ZERO}, vmin=1, vmax=EPS_float, vtmp; + complex_double twisted_bc, tmp1, tmp2; + double phase[4] = {_COMPLEX_double_ZERO, _COMPLEX_double_ZERO, _COMPLEX_double_ZERO, _COMPLEX_double_ZERO}, vmin=1, vmax=EPS_float, vtmp, nrhs, nrhs2; gmres_double_struct *p = g.mixed_precision==2?&(g.p_MP.dp):&(g.p); vector_double vb, rhs = p->b; vector_double vx, sol = p->x; @@ -590,63 +678,97 @@ void DDalphaAMG_driver( double *vector_out, double *vector_in, DDalphaAMG_status g.coarse_iter_count = 0; mg_status->success = 0; mg_status->info = 0; + + ASSERT(vector1_out!=NULL); + ASSERT(vector1_in!=NULL); +#ifdef HAVE_TM1p1 + if(g.n_flavours==2) { + ASSERT(vector2_out!=NULL); + ASSERT(vector2_in!=NULL); + } +#endif + + if(g.mixed_precision!=2) + g.p.tol = tol; + else + g.p_MP.dp.tol = tol; for (i=0; i<4; i++) sl[i] = ll[i]*g.my_coords[i]; - /* - #ifndef INIT_ONE_PREC - if ( g.mixed_precision==2 || vector_index_fct!=NULL || g.bc==_TWISTED) - #else - if ( vector_index_fct!=NULL || g.bc==_TWISTED) - #endif - */ + for (t=0, j=0; t vmax) + vmax=vtmp; + if( vtmp > EPS_double && vtmp < vmin ) + vmin=vtmp; + vtmp=cabs(rhs[j+6]); + if(vtmp > vmax) + vmax=vtmp; + if( vtmp > 
EPS_double && vtmp < vmin ) + vmin=vtmp; + } + } +#endif + if(mu%2) + j+=6; + } + } else +#endif + for ( mu=0; mu<4; mu++ ) + for ( k=0; k<3; k++, j++ ) { #ifndef BASIS4 - rhs[j] = ((complex_double)vector_in[i+2*(k+3*mu)] + I*(complex_double)vector_in[i+2*(k+3*mu)+1]) * twisted_bc; + rhs[j] = ((complex_double)vector1_in[i+2*(k+3*mu)] + I*(complex_double)vector1_in[i+2*(k+3*mu)+1]) * twisted_bc; #else - rhs[j] = ((complex_double)vector_in[i+2*(k+3*(3-mu))] + I*(complex_double)vector_in[i+2*(k+3*(3-mu))+1]) * twisted_bc; + rhs[j] = ((complex_double)vector1_in[i+2*(k+3*(3-mu))] + I*(complex_double)vector1_in[i+2*(k+3*(3-mu))+1]) * twisted_bc; #endif - + #ifndef INIT_ONE_PREC - if(g.mixed_precision==2) { - vtmp=cabs(rhs[j]); - if(vtmp > vmax) - vmax=vtmp; - if( vtmp > EPS_double && vtmp < vmin ) - vmin=vtmp; - } - } + if(g.mixed_precision==2) { + vtmp=cabs(rhs[j]); + if(vtmp > vmax) + vmax=vtmp; + if( vtmp > EPS_double && vtmp < vmin ) + vmin=vtmp; + } + } #endif - } + } } } } - - /* - else { - p->b = (vector_double) vector_in; - p->x = (vector_double) vector_out; - } - */ - + #ifndef INIT_ONE_PREC double gvmin, gvmax; @@ -667,112 +789,148 @@ void DDalphaAMG_driver( double *vector_out, double *vector_in, DDalphaAMG_status p->b = g.p_MP.dp.b; p->x = g.p_MP.dp.x; p->tol = g.p_MP.dp.tol; - } else precision_changed=0; + } else precision_changed = 0; #endif - + switch(_TYPE) { case _SOLVE : -#pragma omp parallel num_threads(threading[0]->n_core) - if ( g.method == -1 ) { - cgn_double( &(g.p), &l, threading[omp_get_thread_num()] ); - } else if ( g.mixed_precision == 2 ) { - fgmres_MP( &(g.p_MP), &l, threading[omp_get_thread_num()] ); - } else { - fgmres_double( &(g.p), &l, threading[omp_get_thread_num()] ); - } + solver( ); break; case _SOLVE_SQ : -#pragma omp parallel num_threads(threading[0]->n_core) - { - // sol = (D_d^{-1})*g5*(D_u^{-1})*g5*rhs - gamma5_double(rhs, rhs, &l, threading[omp_get_thread_num()] ); + THREADED(threading[0]->n_core) +#ifdef HAVE_TM1p1 + if(g.n_flavours==2) + // sol = (D_h^{-1})*g5*tau1*(D_h^{-1})*g5*tau1*rhs + tau1_gamma5_double( rhs, rhs, &l, threading[omp_get_thread_num()] ); + else +#endif + // sol = (D_d^{-1})*g5*(D_u^{-1})*g5*rhs + gamma5_double( rhs, rhs, &l, threading[omp_get_thread_num()] ); + + // read NOTE RESIDUAL + THREADED(threading[0]->n_core) + nrhs = global_norm_double( rhs, p->v_start, p->v_end, &l, threading[omp_get_thread_num()] ); + p->tol = tol/2.; + solver( ); - if ( g.method == -1 ) { - cgn_double( &(g.p), &l, threading[omp_get_thread_num()] ); - } else if ( g.mixed_precision == 2 ) { - fgmres_MP( &(g.p_MP), &l, threading[omp_get_thread_num()] ); - } else { - fgmres_double( &(g.p), &l, threading[omp_get_thread_num()] ); - } + THREADED(threading[0]->n_core) +#ifdef HAVE_TM1p1 + if(g.n_flavours==2) + tau1_gamma5_double(rhs, sol, &l, threading[omp_get_thread_num()] ); + else +#endif + gamma5_double(rhs, sol, &l, threading[omp_get_thread_num()] ); + +#ifdef HAVE_TM1p1 + if(g.n_flavours==2) + change_epsbar_shift_sign( ); + else +#endif + DDalphaAMG_change_mu_sign( &tmp_status ); + + // read NOTE RESIDUAL + THREADED(threading[0]->n_core) + nrhs2 = global_norm_double( rhs, p->v_start, p->v_end, &l, threading[omp_get_thread_num()] ); + p->tol = (tol-g.norm_res)*nrhs/nrhs2/8.; + solver( ); - gamma5_double(rhs, sol, &l, threading[omp_get_thread_num()] ); - } - DDalphaAMG_change_mu_sign( &tmp_status ); -#pragma omp parallel num_threads(threading[0]->n_core) - if ( g.method == -1 ) { - cgn_double( &(g.p), &l, threading[omp_get_thread_num()] ); - } else if ( 
g.mixed_precision == 2 ) { - fgmres_MP( &(g.p_MP), &l, threading[omp_get_thread_num()] ); - } else { - fgmres_double( &(g.p), &l, threading[omp_get_thread_num()] ); - } // DDalphaAMG_change_mu_sign( &tmp_status ); warning0("sign of mu changed during the inversion of squared operator\n"); break; case _SOLVE_SQ_ODD : -#pragma omp parallel num_threads(threading[0]->n_core) - { - // sol = (D_d^{-1})*g5*(D_u^{-1})*g5*rhs - vector_double_gamma5_set_even_to_zero(rhs, rhs, &l, threading[omp_get_thread_num()]); - if ( g.method == -1 ) { - cgn_double( &(g.p), &l, threading[omp_get_thread_num()] ); - } else if ( g.mixed_precision == 2 ) { - fgmres_MP( &(g.p_MP), &l, threading[omp_get_thread_num()] ); - } else { - fgmres_double( &(g.p), &l, threading[omp_get_thread_num()] ); - } - vector_double_gamma5_set_even_to_zero(rhs, sol, &l, threading[omp_get_thread_num()]); - } - DDalphaAMG_change_mu_sign( &tmp_status ); -#pragma omp parallel num_threads(threading[0]->n_core) - if ( g.method == -1 ) { - cgn_double( &(g.p), &l, threading[omp_get_thread_num()] ); - } else if ( g.mixed_precision == 2 ) { - fgmres_MP( &(g.p_MP), &l, threading[omp_get_thread_num()] ); - } else { - fgmres_double( &(g.p), &l, threading[omp_get_thread_num()] ); - } + THREADED(threading[0]->n_core) +#ifdef HAVE_TM1p1 + if(g.n_flavours==2) + // sol = (D_h^{-1})*g5*tau1*(D_h^{-1})*g5*tau1*rhs + tau1_gamma5_set_even_to_zero_double(rhs, rhs, &l, threading[omp_get_thread_num()]); + else +#endif + // sol = (D_d^{-1})*g5*(D_u^{-1})*g5*rhs + gamma5_set_even_to_zero_double(rhs, rhs, &l, threading[omp_get_thread_num()]); + + // read NOTE RESIDUAL + THREADED(threading[0]->n_core) + nrhs = global_norm_double( rhs, p->v_start, p->v_end, &l, threading[omp_get_thread_num()] ); + p->tol = tol/2.; + solver( ); + + THREADED(threading[0]->n_core) +#ifdef HAVE_TM1p1 + if(g.n_flavours==2) + tau1_gamma5_set_even_to_zero_double(rhs, sol, &l, threading[omp_get_thread_num()]); + else +#endif + gamma5_set_even_to_zero_double(rhs, sol, &l, threading[omp_get_thread_num()]); + +#ifdef HAVE_TM1p1 + if(g.n_flavours==2) + change_epsbar_shift_sign( ); + else +#endif + DDalphaAMG_change_mu_sign( &tmp_status ); + + // read NOTE RESIDUAL + THREADED(threading[0]->n_core) + nrhs2 = global_norm_double( rhs, p->v_start, p->v_end, &l, threading[omp_get_thread_num()] ); + p->tol = (tol-g.norm_res)*nrhs/nrhs2/8.; + solver( ); + // DDalphaAMG_change_mu_sign( &tmp_status ); warning0("sign of mu changed during the inversion of squared operator\n"); break; case _SOLVE_SQ_EVEN : -#pragma omp parallel num_threads(threading[0]->n_core) - { - // sol = (D_d^{-1})*g5*(D_u^{-1})*g5*rhs - vector_double_gamma5_set_odd_to_zero(rhs, rhs, &l, threading[omp_get_thread_num()]); - if ( g.method == -1 ) { - cgn_double( &(g.p), &l, threading[omp_get_thread_num()] ); - } else if ( g.mixed_precision == 2 ) { - fgmres_MP( &(g.p_MP), &l, threading[omp_get_thread_num()] ); - } else { - fgmres_double( &(g.p), &l, threading[omp_get_thread_num()] ); - } - vector_double_gamma5_set_odd_to_zero(rhs, sol, &l, threading[omp_get_thread_num()]); - } - DDalphaAMG_change_mu_sign( &tmp_status ); -#pragma omp parallel num_threads(threading[0]->n_core) - if ( g.method == -1 ) { - cgn_double( &(g.p), &l, threading[omp_get_thread_num()] ); - } else if ( g.mixed_precision == 2 ) { - fgmres_MP( &(g.p_MP), &l, threading[omp_get_thread_num()] ); - } else { - fgmres_double( &(g.p), &l, threading[omp_get_thread_num()] ); - } + THREADED(threading[0]->n_core) +#ifdef HAVE_TM1p1 + if(g.n_flavours==2) + // sol = 
(D_h^{-1})*g5*tau1*(D_h^{-1})*g5*tau1*rhs + tau1_gamma5_set_odd_to_zero_double(rhs, rhs, &l, threading[omp_get_thread_num()]); + else +#endif + // sol = (D_d^{-1})*g5*(D_u^{-1})*g5*rhs + gamma5_set_odd_to_zero_double(rhs, rhs, &l, threading[omp_get_thread_num()]); + + // read NOTE RESIDUAL + THREADED(threading[0]->n_core) + nrhs = global_norm_double( rhs, p->v_start, p->v_end, &l, threading[omp_get_thread_num()] ); + p->tol = tol/2.; + solver( ); + + THREADED(threading[0]->n_core) +#ifdef HAVE_TM1p1 + if(g.n_flavours==2) + tau1_gamma5_set_odd_to_zero_double(rhs, sol, &l, threading[omp_get_thread_num()]); + else +#endif + gamma5_set_odd_to_zero_double(rhs, sol, &l, threading[omp_get_thread_num()]); + +#ifdef HAVE_TM1p1 + if(g.n_flavours==2) + change_epsbar_shift_sign( ); + else +#endif + DDalphaAMG_change_mu_sign( &tmp_status ); + + // read NOTE RESIDUAL + THREADED(threading[0]->n_core) + nrhs2 = global_norm_double( rhs, p->v_start, p->v_end, &l, threading[omp_get_thread_num()] ); + p->tol = (tol-g.norm_res)*nrhs/nrhs2/8.; + solver( ); + // DDalphaAMG_change_mu_sign( &tmp_status ); warning0("sign of mu changed during the inversion of squared operator\n"); break; case _PRECOND : -#pragma omp parallel num_threads(threading[0]->n_core) + THREADED(threading[0]->n_core) preconditioner( sol, NULL, rhs, _NO_RES, &l, threading[omp_get_thread_num()] ); break; case _OPERATOR : -#pragma omp parallel num_threads(threading[0]->n_core) + THREADED(threading[0]->n_core) if ( g.mixed_precision == 2 ) { apply_operator_double( sol, rhs, &(g.p_MP.dp), &l, threading[omp_get_thread_num()] ); } else { @@ -786,52 +944,538 @@ void DDalphaAMG_driver( double *vector_out, double *vector_in, DDalphaAMG_status break; } - /* + for (t=0, j=0; tb = vb; + p->x = vx; + } +#endif + + if ( g.norm_res <= tol || _TYPE == _OPERATOR || _TYPE == _PRECOND ) + mg_status->success = 1; + mg_status->info = g.norm_res; + t1 = MPI_Wtime(); + mg_status->time = t1-t0; + mg_status->coarse_time = g.coarse_time; + mg_status->iter_count = g.iter_count; + mg_status->coarse_iter_count = g.coarse_iter_count; + +} + +static inline void DDalphaAMG_ms_driver( double **vector1_out, double *vector1_in, + double **vector2_out, double *vector2_in, + double *even_shifts, double *odd_shifts, int n_shifts, + double *tol, DDalphaAMG_status *mg_status, int _TYPE ) +{ + int t, z, y, x, i, j, k, n, mu, *ll = l.local_lattice, *gl=l.global_lattice, sl[4], precision_changed; + complex_double twisted_bc, tmp1, tmp2; + double phase[4] = {_COMPLEX_double_ZERO, _COMPLEX_double_ZERO, _COMPLEX_double_ZERO, _COMPLEX_double_ZERO}, + vmin=1, vmax=EPS_float, vtmp, nrhs, nrhs2; + gmres_double_struct *p = g.mixed_precision==2?&(g.p_MP.dp):&(g.p); + vector_double vb, rhs = p->b; + vector_double vx, sol = p->x; + vector_double source = NULL, solution = NULL, solution2 = NULL; + DDalphaAMG_status tmp_status; + + double t0, t1; + t0 = MPI_Wtime(); + g.coarse_time = 0; + g.iter_count = 0; + g.coarse_iter_count = 0; + mg_status->success = 0; + mg_status->info = 0; + + ASSERT(vector1_out!=NULL); + ASSERT(vector1_in!=NULL); + ASSERT(tol!=NULL); +#ifdef HAVE_TM1p1 + if(g.n_flavours==2) { + ASSERT(vector2_out!=NULL); + ASSERT(vector2_in!=NULL); + } +#endif + + if(g.mixed_precision!=2) + g.p.tol = tol[0]; + else + g.p_MP.dp.tol = tol[0]; + + for (i=0; i<4; i++) + sl[i] = ll[i]*g.my_coords[i]; + + for (t=0, j=0; t vmax) + vmax=vtmp; + if( vtmp > EPS_double && vtmp < vmin ) + vmin=vtmp; + vtmp=cabs(rhs[j+6]); + if(vtmp > vmax) + vmax=vtmp; + if( vtmp > EPS_double && vtmp < vmin ) + 
vmin=vtmp; + } + } +#endif + if(mu%2) + j+=6; + } + } else +#endif + for ( mu=0; mu<4; mu++ ) + for ( k=0; k<3; k++, j++ ) { +#ifndef BASIS4 + rhs[j] = ((complex_double)vector1_in[i+2*(k+3*mu)] + I*(complex_double)vector1_in[i+2*(k+3*mu)+1]) * twisted_bc; #else - if ( vector_index_fct!=NULL || g.bc==_TWISTED) + rhs[j] = ((complex_double)vector1_in[i+2*(k+3*(3-mu))] + I*(complex_double)vector1_in[i+2*(k+3*(3-mu))+1]) * twisted_bc; +#endif + +#ifndef INIT_ONE_PREC + if( g.mixed_precision == 2 ) { + vtmp = cabs(rhs[j]); + if(vtmp > vmax) + vmax = vtmp; + if( vtmp > EPS_double && vtmp < vmin ) + vmin = vtmp; + } + } +#endif + } + } + } + } + +#ifndef INIT_ONE_PREC + + double gvmin, gvmax; + if( g.mixed_precision == 2 ) { + MPI_Allreduce(&vmin, &gvmin, 1, MPI_DOUBLE, MPI_MIN, g.comm_cart); + MPI_Allreduce(&vmax, &gvmax, 1, MPI_DOUBLE, MPI_MAX, g.comm_cart); + } + + //switching to double precision on the fine level + if(g.mixed_precision==2 && gvmin/gvmaxb; + vx = p->x; + p->b = g.p_MP.dp.b; + p->x = g.p_MP.dp.x; + p->tol = g.p_MP.dp.tol; + } else precision_changed = 0; +#endif + + if ( n_shifts > 0 ) { + ASSERT( even_shifts != NULL ); + ASSERT( odd_shifts != NULL ); + } + if ( n_shifts > 1 ) { + MALLOC( source, complex_double, l.inner_vector_size ); + MALLOC( solution, complex_double, l.inner_vector_size ); + if( _TYPE == _SOLVE_SQ || _TYPE == _SOLVE_SQ_ODD || _TYPE == _SOLVE_SQ_EVEN ) + MALLOC( solution2, complex_double, l.inner_vector_size ); + } + + for ( n = 0; n < n_shifts; n++ ) { + +#ifdef HAVE_TM1p1 + if(g.n_flavours==2) { + if( g.epsbar_ig5_even_shift != even_shifts[n] || g.epsbar_ig5_odd_shift != odd_shifts[n] ) { + g.epsbar_ig5_even_shift = even_shifts[n]; + g.epsbar_ig5_odd_shift = odd_shifts[n]; + THREADED(threading[0]->n_core) + epsbar_term_update( &l, threading[omp_get_thread_num()] ); + THREADED(threading[0]->n_core) + finalize_operator_update( &l, threading[omp_get_thread_num()]); + } + } else +#endif + { +#ifdef HAVE_TM + if( g.mu_even_shift != even_shifts[n] || g.mu_odd_shift != odd_shifts[n] ) { + g.mu_even_shift = even_shifts[n]; + g.mu_odd_shift = odd_shifts[n]; + THREADED(threading[0]->n_core) + tm_term_update( g.mu, &l, threading[omp_get_thread_num()] ); + THREADED(threading[0]->n_core) + finalize_operator_update( &l, threading[omp_get_thread_num()]); + } +#endif + } + + p->tol = tol[n]; + + switch(_TYPE) { + + case _SOLVE : + if ( n ) { + vector_copy( rhs, source ); + p->initial_guess_zero = 0; + } else if ( n_shifts > 1 ) + vector_copy( source, rhs ); + + solver( ); + break; + + + case _SOLVE_SQ : + if ( n ) { + vector_copy( rhs, source ); + p->initial_guess_zero = 0; + } else if ( n_shifts > 1 ) { + THREADED(threading[0]->n_core) +#ifdef HAVE_TM1p1 + if(g.n_flavours==2) + // sol = (D_h^{-1})*g5*tau1*(D_h^{-1})*g5*tau1*rhs + tau1_gamma5_double( rhs, rhs, &l, threading[omp_get_thread_num()] ); + else +#endif + // sol = (D_d^{-1})*g5*(D_u^{-1})*g5*rhs + gamma5_double( rhs, rhs, &l, threading[omp_get_thread_num()] ); + vector_copy( source, rhs ); + } + + if( n ) + correct_guess( sol, solution, solution2, even_shifts[n]-even_shifts[n-1], odd_shifts[n]-odd_shifts[n-1]); + // read NOTE RESIDUAL + THREADED(threading[0]->n_core) + nrhs = global_norm_double( rhs, p->v_start, p->v_end, &l, threading[omp_get_thread_num()] ); + p->tol = tol[n]/2.; + solver( ); + if ( n < n_shifts-1 ) + vector_copy( solution, sol ); + + THREADED(threading[0]->n_core) +#ifdef HAVE_TM1p1 + if(g.n_flavours==2) + tau1_gamma5_double(rhs, sol, &l, threading[omp_get_thread_num()] ); + else +#endif + 
gamma5_double(rhs, sol, &l, threading[omp_get_thread_num()] ); + +#ifdef HAVE_TM1p1 + if(g.n_flavours==2) + change_epsbar_shift_sign( ); + else +#endif + DDalphaAMG_change_mu_sign( &tmp_status ); + + if( n ) + vector_copy( sol, solution2 ); + + // read NOTE RESIDUAL + THREADED(threading[0]->n_core) + nrhs2 = global_norm_double( rhs, p->v_start, p->v_end, &l, threading[omp_get_thread_num()] ); + p->tol = (tol[n]-g.norm_res)*nrhs/nrhs2/8.; + solver( ); + if ( n < n_shifts-1 ) + vector_copy( solution2, sol ); + + // DDalphaAMG_change_mu_sign( &tmp_status ); + warning0("sign of mu changed during the inversion of squared operator\n"); + break; + + + case _SOLVE_SQ_ODD : + if ( n ) { + vector_copy( rhs, source ); + p->initial_guess_zero = 0; + } else if ( n_shifts > 1 ) { + THREADED(threading[0]->n_core) +#ifdef HAVE_TM1p1 + if(g.n_flavours==2) + // sol = (D_h^{-1})*g5*tau1*(D_h^{-1})*g5*tau1*rhs + tau1_gamma5_set_even_to_zero_double(rhs, rhs, &l, threading[omp_get_thread_num()]); + else #endif - */ + // sol = (D_d^{-1})*g5*(D_u^{-1})*g5*rhs + gamma5_set_even_to_zero_double(rhs, rhs, &l, threading[omp_get_thread_num()]); + + vector_copy( source, rhs ); + } + + if( n ) + correct_guess( sol, solution, solution2, even_shifts[n]-even_shifts[n-1], odd_shifts[n]-odd_shifts[n-1]); + + // read NOTE RESIDUAL + THREADED(threading[0]->n_core) + nrhs = global_norm_double( rhs, p->v_start, p->v_end, &l, threading[omp_get_thread_num()] ); + p->tol = tol[n]/2.; + solver( ); + if ( n < n_shifts-1 ) + vector_copy( solution, sol ); + + THREADED(threading[0]->n_core) +#ifdef HAVE_TM1p1 + if(g.n_flavours==2) + tau1_gamma5_set_even_to_zero_double(rhs, sol, &l, threading[omp_get_thread_num()]); + else +#endif + gamma5_set_even_to_zero_double(rhs, sol, &l, threading[omp_get_thread_num()]); + +#ifdef HAVE_TM1p1 + if(g.n_flavours==2) + change_epsbar_shift_sign( ); + else +#endif + DDalphaAMG_change_mu_sign( &tmp_status ); + + if( n ) + vector_copy( sol, solution2 ); + + // read NOTE RESIDUAL + THREADED(threading[0]->n_core) + nrhs2 = global_norm_double( rhs, p->v_start, p->v_end, &l, threading[omp_get_thread_num()] ); + p->tol = (tol[n]-g.norm_res)*nrhs/nrhs2/8.; + solver( ); + if ( n < n_shifts-1 ) + vector_copy( solution2, sol ); + + // DDalphaAMG_change_mu_sign( &tmp_status ); + warning0("sign of mu changed during the inversion of squared operator\n"); + break; + + + case _SOLVE_SQ_EVEN : + if ( n ) { + vector_copy( rhs, source ); + p->initial_guess_zero = 0; + } else if ( n_shifts > 1 ) { + THREADED(threading[0]->n_core) +#ifdef HAVE_TM1p1 + if(g.n_flavours==2) + // sol = (D_h^{-1})*g5*tau1*(D_h^{-1})*g5*tau1*rhs + tau1_gamma5_set_odd_to_zero_double(rhs, rhs, &l, threading[omp_get_thread_num()]); + else +#endif + // sol = (D_d^{-1})*g5*(D_u^{-1})*g5*rhs + gamma5_set_odd_to_zero_double(rhs, rhs, &l, threading[omp_get_thread_num()]); + + vector_copy( source, rhs ); + } + + if( n ) + correct_guess( sol, solution, solution2, even_shifts[n]-even_shifts[n-1], odd_shifts[n]-odd_shifts[n-1]); + + // read NOTE RESIDUAL + THREADED(threading[0]->n_core) + nrhs = global_norm_double( rhs, p->v_start, p->v_end, &l, threading[omp_get_thread_num()] ); + p->tol = tol[n]/2.; + solver( ); + if ( n < n_shifts-1 ) + vector_copy( solution, sol ); + + THREADED(threading[0]->n_core) +#ifdef HAVE_TM1p1 + if(g.n_flavours==2) + tau1_gamma5_set_odd_to_zero_double(rhs, sol, &l, threading[omp_get_thread_num()]); + else +#endif + gamma5_set_odd_to_zero_double(rhs, sol, &l, threading[omp_get_thread_num()]); + +#ifdef HAVE_TM1p1 + 
if(g.n_flavours==2) + change_epsbar_shift_sign( ); + else +#endif + DDalphaAMG_change_mu_sign( &tmp_status ); + + if( n ) + vector_copy( sol, solution2 ); + // read NOTE RESIDUAL + THREADED(threading[0]->n_core) + nrhs2 = global_norm_double( rhs, p->v_start, p->v_end, &l, threading[omp_get_thread_num()] ); + p->tol = (tol[n]-g.norm_res)*nrhs/nrhs2/8.; + solver( ); + if ( n < n_shifts-1 ) + vector_copy( solution2, sol ); + + // DDalphaAMG_change_mu_sign( &tmp_status ); + warning0("sign of mu changed during the inversion of squared operator\n"); + break; + + + case _PRECOND : + THREADED(threading[0]->n_core) + preconditioner( sol, NULL, rhs, _NO_RES, &l, threading[omp_get_thread_num()] ); + break; + + + case _OPERATOR : + THREADED(threading[0]->n_core) + if ( g.mixed_precision == 2 ) { + apply_operator_double( sol, rhs, &(g.p_MP.dp), &l, threading[omp_get_thread_num()] ); + } else { + apply_operator_double( sol, rhs, &(g.p), &l, threading[omp_get_thread_num()] ); + } + break; + + + default : + warning0("_TYPE not found in DDalphaAMG_driver. Returing vector in as vector out."); + sol=rhs; + break; + } + for (t=0, j=0; tb = rhs; - p->x = sol; + + } + + p->initial_guess_zero = 1; + if ( n_shifts > 0 ) { + FREE( source, complex_double, l.inner_vector_size ); + FREE( solution, complex_double, l.inner_vector_size ); + if( _TYPE == _SOLVE_SQ || _TYPE == _SOLVE_SQ_ODD || _TYPE == _SOLVE_SQ_EVEN ) + FREE( solution2, complex_double, l.inner_vector_size ); } - */ + #ifndef INIT_ONE_PREC if (precision_changed) { g.mixed_precision=2; @@ -840,95 +1484,152 @@ void DDalphaAMG_driver( double *vector_out, double *vector_in, DDalphaAMG_status p->x = vx; } #endif - + + if ( g.norm_res <= p->tol || _TYPE == _OPERATOR || _TYPE == _PRECOND ) + mg_status->success = 1; mg_status->info = g.norm_res; t1 = MPI_Wtime(); mg_status->time = t1-t0; mg_status->coarse_time = g.coarse_time; mg_status->iter_count = g.iter_count; mg_status->coarse_iter_count = g.coarse_iter_count; - + +} + +static inline void set_n_flavours( int n) { + +#ifdef HAVE_TM1p1 + THREADED(threading[0]->n_core) + data_layout_n_flavours( n, &l, threading[omp_get_thread_num()] ); +#else + if( n==2 ) + error0("For DDalphaAMG_solve_doublet_*, HAVE_TM1p1 flag required\n"); +#endif + } void DDalphaAMG_solve( double *vector_out, double *vector_in, double tol, DDalphaAMG_status *mg_status ) { - - if(g.mixed_precision!=2) { - g.p.tol = tol; - } - else { - g.p_MP.dp.tol = tol; - } - - DDalphaAMG_driver( vector_out, vector_in, mg_status, _SOLVE ); + DDalphaAMG_driver( vector_out, vector_in, NULL, NULL, tol, mg_status, _SOLVE ); +} - if ( g.norm_res <= tol ) - mg_status->success = 1; +void DDalphaAMG_solve_doublet( double *vector1_out, double *vector1_in, + double *vector2_out, double *vector2_in, + double tol, DDalphaAMG_status *mg_status ) +{ + set_n_flavours( 2 ); + DDalphaAMG_driver( vector1_out, vector1_in, vector2_out, vector2_in, tol, mg_status, _SOLVE ); + set_n_flavours( 1 ); +} +void DDalphaAMG_solve_ms_doublet( double **vector1_out, double *vector1_in, + double **vector2_out, double *vector2_in, + double *even_shifts, double *odd_shifts, int n_shifts, + double *tol, DDalphaAMG_status *mg_status ) +{ + set_n_flavours( 2 ); + DDalphaAMG_ms_driver( vector1_out, vector1_in, vector2_out, vector2_in, even_shifts, odd_shifts, n_shifts, + tol, mg_status, _SOLVE ); + set_n_flavours( 1 ); } void DDalphaAMG_solve_squared( double *vector_out, double *vector_in, double tol, DDalphaAMG_status *mg_status ) { + DDalphaAMG_driver( vector_out, vector_in, NULL, NULL, tol, 
mg_status, _SOLVE_SQ ); +} - if(g.mixed_precision!=2) { - g.p.tol = tol; - } - else { - g.p_MP.dp.tol = tol; - } - - DDalphaAMG_driver( vector_out, vector_in, mg_status, _SOLVE_SQ ); - - if ( g.norm_res <= tol ) - mg_status->success = 1; +void DDalphaAMG_solve_doublet_squared( double *vector1_out, double *vector1_in, + double *vector2_out, double *vector2_in, + double tol, DDalphaAMG_status *mg_status ) +{ + set_n_flavours( 2 ); + DDalphaAMG_driver( vector1_out, vector1_in, vector2_out, vector2_in, tol, mg_status, _SOLVE_SQ ); + set_n_flavours( 1 ); +} +void DDalphaAMG_solve_ms_doublet_squared( double **vector1_out, double *vector1_in, + double **vector2_out, double *vector2_in, + double *even_shifts, double *odd_shifts, int n_shifts, + double *tol, DDalphaAMG_status *mg_status ) +{ + set_n_flavours( 2 ); + DDalphaAMG_ms_driver( vector1_out, vector1_in, vector2_out, vector2_in, even_shifts, odd_shifts, n_shifts, + tol, mg_status, _SOLVE_SQ ); + set_n_flavours( 1 ); } void DDalphaAMG_solve_squared_odd( double *vector_out, double *vector_in, double tol, DDalphaAMG_status *mg_status ) { + DDalphaAMG_driver( vector_out, vector_in, NULL, NULL, tol, mg_status, _SOLVE_SQ_ODD ); +} - if(g.mixed_precision!=2) { - g.p.tol = tol; - } - else { - g.p_MP.dp.tol = tol; - } - - DDalphaAMG_driver( vector_out, vector_in, mg_status, _SOLVE_SQ_ODD ); - - if ( g.norm_res <= tol ) - mg_status->success = 1; +void DDalphaAMG_solve_doublet_squared_odd( double *vector1_out, double *vector1_in, + double *vector2_out, double *vector2_in, + double tol, DDalphaAMG_status *mg_status ) +{ + set_n_flavours( 2 ); + DDalphaAMG_driver( vector1_out, vector1_in, vector2_out, vector2_in, tol, mg_status, _SOLVE_SQ_ODD ); + set_n_flavours( 1 ); +} + +void DDalphaAMG_solve_ms_doublet_squared_odd( double **vector1_out, double *vector1_in, + double **vector2_out, double *vector2_in, + double *even_shifts, double *odd_shifts, int n_shifts, + double *tol, DDalphaAMG_status *mg_status ) +{ + set_n_flavours( 2 ); + DDalphaAMG_ms_driver( vector1_out, vector1_in, vector2_out, vector2_in, even_shifts, odd_shifts, n_shifts, + tol, mg_status, _SOLVE_SQ_ODD ); + set_n_flavours( 1 ); } void DDalphaAMG_solve_squared_even( double *vector_out, double *vector_in, double tol, DDalphaAMG_status *mg_status ) { + DDalphaAMG_driver( vector_out, vector_in, NULL, NULL, tol, mg_status, _SOLVE_SQ_EVEN ); +} - if(g.mixed_precision!=2) { - g.p.tol = tol; - } - else { - g.p_MP.dp.tol = tol; - } - - DDalphaAMG_driver( vector_out, vector_in, mg_status, _SOLVE_SQ_EVEN ); - - if ( g.norm_res <= tol ) - mg_status->success = 1; +void DDalphaAMG_solve_doublet_squared_even( double *vector1_out, double *vector1_in, + double *vector2_out, double *vector2_in, + double tol, DDalphaAMG_status *mg_status ) +{ + set_n_flavours( 2 ); + DDalphaAMG_driver( vector1_out, vector1_in, vector2_out, vector2_in, tol, mg_status, _SOLVE_SQ_EVEN ); + set_n_flavours( 1 ); } +void DDalphaAMG_solve_ms_doublet_squared_even( double **vector1_out, double *vector1_in, + double **vector2_out, double *vector2_in, + double *even_shifts, double *odd_shifts, int n_shifts, + double *tol, DDalphaAMG_status *mg_status ) +{ + set_n_flavours( 2 ); + DDalphaAMG_ms_driver( vector1_out, vector1_in, vector2_out, vector2_in, even_shifts, odd_shifts, n_shifts, + tol, mg_status, _SOLVE_SQ_EVEN ); + set_n_flavours( 1 ); +} void DDalphaAMG_apply_operator( double *vector_out, double *vector_in, DDalphaAMG_status *mg_status ) { - - DDalphaAMG_driver( vector_out, vector_in, mg_status, _OPERATOR ); - - 
mg_status->success = 1; + DDalphaAMG_driver( vector_out, vector_in, NULL, NULL, 0, mg_status, _OPERATOR ); +} + +void DDalphaAMG_apply_operator_doublet( double *vector1_out, double *vector1_in, + double *vector2_out, double *vector2_in, DDalphaAMG_status *mg_status ) +{ + set_n_flavours( 2 ); + DDalphaAMG_driver( vector1_out, vector1_in, vector2_out, vector2_in, 0, mg_status, _OPERATOR ); + set_n_flavours( 1 ); } void DDalphaAMG_preconditioner( double *vector_out, double *vector_in, DDalphaAMG_status * mg_status ) { + DDalphaAMG_driver( vector_out, vector_in, NULL, NULL, 0, mg_status, _PRECOND ); +} - DDalphaAMG_driver( vector_out, vector_in, mg_status, _PRECOND ); - - mg_status->success = 1; +void DDalphaAMG_preconditioner_doublet( double *vector1_out, double *vector1_in, + double *vector2_out, double *vector2_in, DDalphaAMG_status *mg_status ) +{ + set_n_flavours( 2 ); + DDalphaAMG_driver( vector1_out, vector1_in, vector2_out, vector2_in, 0, mg_status, _PRECOND ); + set_n_flavours( 1 ); } void DDalphaAMG_free( void ) { @@ -949,7 +1650,7 @@ void DDalphaAMG_finalize( void ) { if (g.setup_flag) method_free( &l ); method_finalize( &l ); - + } MPI_Comm DDalphaAMG_get_communicator( void ){ @@ -989,11 +1690,14 @@ void DDalphaAMG_write_vector( double *vector_out, char *filename, int format, DD void DDalphaAMG_define_vector_const( double *vector, double re, double im ) { -#pragma omp parallel num_threads(threading[0]->n_core) + THREADED(threading[0]->n_core) if(vector!=NULL){ - int start, end; - compute_core_start_end( 0, l.inner_vector_size, &start, &end, &l, threading[omp_get_thread_num()]); - vector_double_define( (vector_double) vector, re+I*im, start, end, &l ); + if ( re && im ) + vector_double_define( (vector_double) vector, re+I*im, 0, l.inner_vector_size, &l, threading[omp_get_thread_num()] ); + else if ( re ) + vector_double_define_real( (vector_double) vector, re, 0, l.inner_vector_size, &l, threading[omp_get_thread_num()] ); + else + vector_double_define_zero( (vector_double) vector, 0, l.inner_vector_size, &l, threading[omp_get_thread_num()] ); } else { warning0("Vector NULL when calling DDalphaAMG_define_vector_const!"); @@ -1002,14 +1706,12 @@ void DDalphaAMG_define_vector_const( double *vector, double re, double im ) { void DDalphaAMG_define_vector_rand( double *vector ) { -#pragma omp parallel num_threads(threading[0]->n_core) + THREADED(threading[0]->n_core) if(vector!=NULL){ - int start, end; - compute_core_start_end( 0, l.inner_vector_size, &start, &end, &l, threading[omp_get_thread_num()]); - vector_double_define_random( (vector_double) vector, start, end, &l ); + vector_double_define_random( (vector_double) vector, 0, l.inner_vector_size, &l, threading[omp_get_thread_num()] ); } else { - warning0("Vector NULL when calling DDalphaAMG_define_vector_const!"); + warning0("Vector NULL when calling DDalphaAMG_define_vector_rand!"); } } @@ -1017,29 +1719,23 @@ void DDalphaAMG_define_vector_rand( double *vector ) { double DDalphaAMG_vector_norm( double *vector ) { double norm = 0; -#pragma omp parallel num_threads(threading[0]->n_core) - if(vector!=NULL){ - int start, end; - norm = global_norm_double( (vector_double) vector, 0, l.inner_vector_size, &l, threading[omp_get_thread_num()] ); - } - else { - warning0("Vector NULL when calling DDalphaAMG_define_vector_const!"); - } + if(vector!=NULL) + THREADED(threading[0]->n_core) + norm = global_norm_double( (vector_double) vector, 0, l.inner_vector_size, &l, threading[omp_get_thread_num()] ); + else + warning0("Vector NULL when calling 
DDalphaAMG_define_vector_norm!"); return norm; } void DDalphaAMG_vector_saxpy( double *vector_out, double a, double *x, double *y ) { - #pragma omp parallel num_threads(threading[0]->n_core) - if(vector_out!=NULL && x!=NULL && y!=NULL){ - int start, end; - compute_core_start_end( 0, l.inner_vector_size, &start, &end, &l, threading[omp_get_thread_num()]); - vector_double_saxpy( (vector_double) vector_out, (vector_double) x, (vector_double) y, a, start, end, &l ); - } - else { - warning0("Vector NULL when calling DDalphaAMG_define_vector_const!"); - } + if(vector_out!=NULL && x!=NULL && y!=NULL) + THREADED(threading[0]->n_core) + vector_double_saxpy( (vector_double) vector_out, (vector_double) x, (vector_double) y, a, 0, + l.inner_vector_size, &l, threading[omp_get_thread_num()] ); + else + warning0("Vector NULL when calling DDalphaAMG_define_vector_saxpy!"); } @@ -1049,7 +1745,7 @@ void DDalphaAMG_test_routine( DDalphaAMG_status *mg_status ) { t0 = MPI_Wtime(); printf00("\n"); -#pragma omp parallel num_threads(threading[0]->n_core) + THREADED(threading[0]->n_core) test_routine( &l, threading[omp_get_thread_num()]); if (g.test < 1e-5) @@ -1073,14 +1769,29 @@ void DDalphaAMG_get_parameters( DDalphaAMG_parameters *mg_params ){ mg_params->mixed_precision = g.mixed_precision; mg_params->kcycle_tolerance = g.kcycle_tol; mg_params->coarse_tolerance = g.coarse_tol; + mg_params->smoother_iterations = g.post_smooth_iter[0]; mg_params->conf_index_fct = conf_index_fct; mg_params->vector_index_fct = vector_index_fct; - mg_params->kappa = 0.5/(l.real_shift + 4.); - mg_params->mu = g.tm_mu; - mg_params->mu_odd_shift = g.tm_mu_odd_shift; - mg_params->mu_even_shift = g.tm_mu_even_shift; + mg_params->kappa = 0.5/(g.m0 + 4.); +#ifdef HAVE_TM + mg_params->mu = g.mu; + mg_params->mu_odd_shift = g.mu_odd_shift; + mg_params->mu_even_shift = g.mu_even_shift; +#else + mg_params->mu = 0; + mg_params->mu_odd_shift = 0; + mg_params->mu_even_shift = 0; +#endif +#ifdef HAVE_TM1p1 + mg_params->epsbar = g.epsbar; + mg_params->epsbar_ig5_odd_shift = g.epsbar_ig5_odd_shift; + mg_params->epsbar_ig5_even_shift = g.epsbar_ig5_even_shift; +#else + mg_params->epsbar = 0; + mg_params->epsbar_ig5_odd_shift = 0; + mg_params->epsbar_ig5_even_shift = 0; +#endif mg_params->print = g.print; - mg_params->smoother_iterations = g.post_smooth_iter[0]; for( i=0; img_basis_vectors[i] = g.num_eig_vect[i]; mg_params->setup_iterations[i] = g.setup_iter[i]; - mg_params->mu_factor[i] = g.tm_mu_factor[i]; +#ifdef HAVE_TM + mg_params->mu_factor[i] = g.mu_factor[i]; +#else + mg_params->mu_factor[i] = 1; +#endif +#ifdef HAVE_TM1p1 + mg_params->epsbar_factor[i] = g.epsbar_factor[i]; +#else + mg_params->epsbar_factor[i] = 1; +#endif } } diff --git a/src/blas_vectorized.h b/src/blas_vectorized.h deleted file mode 100644 index 2afa928..0000000 --- a/src/blas_vectorized.h +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder. - * - * This file is part of the DDalphaAMG solver library. - * - * The DDalphaAMG solver library is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * The DDalphaAMG solver library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the - * GNU General Public License for more details. - * - * - * You should have received a copy of the GNU General Public License - * along with the DDalphaAMG solver library. If not, see http://www.gnu.org/licenses/. - * - */ - -#ifndef BLAS_VECTORIZED_H -#define BLAS_VECTORIZED_H - -// BLAS naming convention: LDA = leading dimension of A -#ifdef SSE -#include "sse_blas_vectorized.h" -#endif - -// C=A*B+C -static inline void cgemv(const int N, const OPERATOR_TYPE_float *A, int lda, const float *B, float *C) -{ -#ifdef SSE - sse_cgemv( N, A, lda, B, C ); -#endif -} - -// C=-A*B+C -static inline void cgenmv(const int N, const OPERATOR_TYPE_float *A, int lda, const float *B, float *C) -{ -#ifdef SSE - sse_cgenmv( N, A, lda, B, C ); -#endif -} - - -static inline void cgem_inverse(const int N, OPERATOR_TYPE_float *A_inverse, OPERATOR_TYPE_float *A, int lda) -{ -#ifdef SSE - sse_cgem_inverse( N, A_inverse, A, lda ); -#endif -} - -#endif // BLAS_VECTORIZED_H diff --git a/src/clifford.h b/src/clifford.h index 6521566..683b062 100644 --- a/src/clifford.h +++ b/src/clifford.h @@ -653,14 +653,27 @@ #endif #endif -#ifdef SSE static const int gamma_co[4][4] = { {GAMMA_T_SPIN0_CO, GAMMA_T_SPIN1_CO, GAMMA_T_SPIN2_CO, GAMMA_T_SPIN3_CO}, {GAMMA_Z_SPIN0_CO, GAMMA_Z_SPIN1_CO, GAMMA_Z_SPIN2_CO, GAMMA_Z_SPIN3_CO}, {GAMMA_Y_SPIN0_CO, GAMMA_Y_SPIN1_CO, GAMMA_Y_SPIN2_CO, GAMMA_Y_SPIN3_CO}, {GAMMA_X_SPIN0_CO, GAMMA_X_SPIN1_CO, GAMMA_X_SPIN2_CO, GAMMA_X_SPIN3_CO}}; -static const double complex gamma_val[4][4] = { +#ifdef HAVE_TM1p1 +static const int gamma_doublet_offset[4][4] = { + {GAMMA_T_SPIN0_CO/2, GAMMA_T_SPIN1_CO/2, GAMMA_T_SPIN2_CO/2, GAMMA_T_SPIN3_CO/2}, + {GAMMA_Z_SPIN0_CO/2, GAMMA_Z_SPIN1_CO/2, GAMMA_Z_SPIN2_CO/2, GAMMA_Z_SPIN3_CO/2}, + {GAMMA_Y_SPIN0_CO/2, GAMMA_Y_SPIN1_CO/2, GAMMA_Y_SPIN2_CO/2, GAMMA_Y_SPIN3_CO/2}, + {GAMMA_X_SPIN0_CO/2, GAMMA_X_SPIN1_CO/2, GAMMA_X_SPIN2_CO/2, GAMMA_X_SPIN3_CO/2}}; +#endif + +static const complex_double gamma_val_double[4][4] = { + {GAMMA_T_SPIN0_VAL, GAMMA_T_SPIN1_VAL, GAMMA_T_SPIN2_VAL, GAMMA_T_SPIN3_VAL}, + {GAMMA_Z_SPIN0_VAL, GAMMA_Z_SPIN1_VAL, GAMMA_Z_SPIN2_VAL, GAMMA_Z_SPIN3_VAL}, + {GAMMA_Y_SPIN0_VAL, GAMMA_Y_SPIN1_VAL, GAMMA_Y_SPIN2_VAL, GAMMA_Y_SPIN3_VAL}, + {GAMMA_X_SPIN0_VAL, GAMMA_X_SPIN1_VAL, GAMMA_X_SPIN2_VAL, GAMMA_X_SPIN3_VAL}}; + +static const complex_float gamma_val_float[4][4] = { {GAMMA_T_SPIN0_VAL, GAMMA_T_SPIN1_VAL, GAMMA_T_SPIN2_VAL, GAMMA_T_SPIN3_VAL}, {GAMMA_Z_SPIN0_VAL, GAMMA_Z_SPIN1_VAL, GAMMA_Z_SPIN2_VAL, GAMMA_Z_SPIN3_VAL}, {GAMMA_Y_SPIN0_VAL, GAMMA_Y_SPIN1_VAL, GAMMA_Y_SPIN2_VAL, GAMMA_Y_SPIN3_VAL}, @@ -683,6 +696,5 @@ static const int gamma_im_sign[4][4] = { {GAMMA_Z_SPIN0_IM_SIGN,GAMMA_Z_SPIN1_IM_SIGN,GAMMA_Z_SPIN2_IM_SIGN,GAMMA_Z_SPIN3_IM_SIGN}, {GAMMA_Y_SPIN0_IM_SIGN,GAMMA_Y_SPIN1_IM_SIGN,GAMMA_Y_SPIN2_IM_SIGN,GAMMA_Y_SPIN3_IM_SIGN}, {GAMMA_X_SPIN0_IM_SIGN,GAMMA_X_SPIN1_IM_SIGN,GAMMA_X_SPIN2_IM_SIGN,GAMMA_X_SPIN3_IM_SIGN}}; -#endif #endif diff --git a/src/coarse_coupling_generic.c b/src/coarse_coupling_generic.c new file mode 100644 index 0000000..12bd8e5 --- /dev/null +++ b/src/coarse_coupling_generic.c @@ -0,0 +1,1369 @@ +/* + * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder. + * + * This file is part of the DDalphaAMG solver library. 
+ * + * The DDalphaAMG solver library is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * The DDalphaAMG solver library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * + * You should have received a copy of the GNU General Public License + * along with the DDalphaAMG solver library. If not, see http://www.gnu.org/licenses/. + * + */ + +#include "main.h" + +#ifdef OPTIMIZED_INTERPOLATION_SETUP_PRECISION +void coarse_operator_PRECISION_setup_vectorized( complex_PRECISION *operator, level_struct *l, struct Thread *threading ) { + + SYNC_HYPERTHREADS(threading) + SYNC_CORES(threading) + + double t0, t1; + t0 = MPI_Wtime(); + + int mu, j, n = l->num_eig_vect, num_aggregates = l->is_PRECISION.num_agg, + aggregate_sites = l->num_inner_lattice_sites / num_aggregates, + clover_site_size = (l->num_eig_vect*(l->num_eig_vect*2+1)), + block_site_size = (l->num_eig_vect*(l->num_eig_vect+1)), + D_link_size = 4*l->num_eig_vect*l->num_eig_vect*4, // size of links in all 4 directions + fine_components = l->num_lattice_site_var; + + + + START_LOCKED_MASTER(threading) + operator_PRECISION_define( &(l->next_level->op_PRECISION), l->next_level ); + END_LOCKED_MASTER(threading) + SYNC_HYPERTHREADS(threading) + + // each thread loops overs its aggregates and then over internal d.o.f. + for ( int a=threading->n_core*threading->thread+threading->core; an_core*threading->n_thread ) { + for ( j=0; jnext_level->op_PRECISION.D[j+a*D_link_size] = _COMPLEX_PRECISION_ZERO; + for ( j=0; jnext_level->op_PRECISION.clover[j+a*clover_site_size] = _COMPLEX_PRECISION_ZERO; + for ( j=0; jnext_level->op_PRECISION.odd_proj[j+a*block_site_size] = _COMPLEX_PRECISION_ZERO; + } + + complex_PRECISION *mpi_buffer = NULL; + START_MASTER(threading) + MALLOC_HUGEPAGES( mpi_buffer, complex_PRECISION, OPERATOR_COMPONENT_OFFSET_PRECISION*(l->vector_size-l->inner_vector_size), 64 ); + END_MASTER(threading) + + int direction_flags[8*l->block_lattice[T]*l->block_lattice[Z]*l->block_lattice[Y]*l->block_lattice[X]]; + + // set up table for direction flags + int *flags = direction_flags; + if(l->depth == 0) { + // even sites + for(int t=0; t < l->block_lattice[T]; t++) { + for(int z=0; z < l->block_lattice[Z]; z++) { + for(int y=0; y < l->block_lattice[Y]; y++) { + for(int x=0; x < l->block_lattice[X]; x++) { + if((x+y+z+t)%2 == 0) { + flags[2*X+0] = (x == 0)?0:1; + flags[2*X+1] = (x == l->block_lattice[X]-1)?0:1; + flags[2*Y+0] = (y == 0)?0:1; + flags[2*Y+1] = (y == l->block_lattice[Y]-1)?0:1; + flags[2*Z+0] = (z == 0)?0:1; + flags[2*Z+1] = (z == l->block_lattice[Z]-1)?0:1; + flags[2*T+0] = (t == 0)?0:1; + flags[2*T+1] = (t == l->block_lattice[T]-1)?0:1; + flags += 8; + } + } + } + } + } + // odd sites + for(int t=0; t < l->block_lattice[T]; t++) { + for(int z=0; z < l->block_lattice[Z]; z++) { + for(int y=0; y < l->block_lattice[Y]; y++) { + for(int x=0; x < l->block_lattice[X]; x++) { + if((x+y+z+t)%2 == 1) { + flags[2*X+0] = (x == 0)?0:1; + flags[2*X+1] = (x == l->block_lattice[X]-1)?0:1; + flags[2*Y+0] = (y == 0)?0:1; + flags[2*Y+1] = (y == l->block_lattice[Y]-1)?0:1; + flags[2*Z+0] = (z == 0)?0:1; + flags[2*Z+1] = (z == l->block_lattice[Z]-1)?0:1; + flags[2*T+0] = 
(t == 0)?0:1; + flags[2*T+1] = (t == l->block_lattice[T]-1)?0:1; + flags += 8; + } + } + } + } + } + } else { + for(int t=0; t < l->block_lattice[T]; t++) { + for(int z=0; z < l->block_lattice[Z]; z++) { + for(int y=0; y < l->block_lattice[Y]; y++) { + for(int x=0; x < l->block_lattice[X]; x++) { + flags[2*X+0] = (x == 0)?0:1; + flags[2*X+1] = (x == l->block_lattice[X]-1)?0:1; + flags[2*Y+0] = (y == 0)?0:1; + flags[2*Y+1] = (y == l->block_lattice[Y]-1)?0:1; + flags[2*Z+0] = (z == 0)?0:1; + flags[2*Z+1] = (z == l->block_lattice[Z]-1)?0:1; + flags[2*T+0] = (t == 0)?0:1; + flags[2*T+1] = (t == l->block_lattice[T]-1)?0:1; + flags += 8; + } + } + } + } + } + + complex_PRECISION eta1[fine_components*OPERATOR_COMPONENT_OFFSET_PRECISION] __attribute__((aligned(64))); + complex_PRECISION eta2[fine_components*OPERATOR_COMPONENT_OFFSET_PRECISION] __attribute__((aligned(64))); + complex_PRECISION tmp[4*4*n*OPERATOR_COMPONENT_OFFSET_PRECISION] __attribute__((aligned(64))); + + for ( int a=threading->n_core*threading->thread+threading->core; an_core*threading->n_thread ) { + + // new aggregate is starting, zero out tmp + for(int i=0; i<4*4*n*OPERATOR_COMPONENT_OFFSET_PRECISION; i++) + tmp[i] = 0.0; + + for ( int site=a*aggregate_sites; site<(a+1)*aggregate_sites; site++ ) { + if(l->depth == 0) { + for ( int c=0; cnum_eig_vect; c+=SIMD_LENGTH_PRECISION ) + d_plus_clover_aggregate_PRECISION_vectorized( eta1+c*fine_components, eta2+c*fine_components, + operator+c*l->vector_size, &(l->s_PRECISION), l, site, + direction_flags+8*(site%(l->block_lattice[T]*l->block_lattice[Z]*l->block_lattice[Y]*l->block_lattice[X])) ); + } else { + for ( int c=0; cnum_eig_vect; c+=SIMD_LENGTH_PRECISION ) + coarse_aggregate_self_couplings_PRECISION_vectorized( eta1+c*fine_components, eta2+c*fine_components, + operator+c*l->vector_size, &(l->s_PRECISION), l, site, + direction_flags+8*(site%(l->block_lattice[T]*l->block_lattice[Z]*l->block_lattice[Y]*l->block_lattice[X])) ); + } + set_coarse_self_coupling_PRECISION_vectorized( eta1, eta2, operator, l, site, n, tmp ); + } + + // aggregate is done, finalize + set_coarse_self_coupling_PRECISION_vectorized_finalize( l, a*aggregate_sites, n, tmp ); + + } + + + SYNC_HYPERTHREADS(threading) + START_LOCKED_MASTER(threading) + // neighbors + for ( int c=0; cnum_eig_vect; c+=SIMD_LENGTH_PRECISION ) { + for ( mu=0; mu<4; mu++ ) { + // determine start of buffer for this mu + int start = 0; + for ( int j=0; js_PRECISION.op.c.num_boundary_sites[2*j]; + + // update ghost cells of V[i] + negative_sendrecv_PRECISION_vectorized( operator+c*l->vector_size, mu, &(l->s_PRECISION.op.c), l, + SIMD_LENGTH_PRECISION, mpi_buffer+c*(l->vector_size-l->inner_vector_size)+fine_components*start*SIMD_LENGTH_PRECISION ); + } + for ( mu=0; mu<4; mu++ ) { + // finish updating ghostcells of V[i] + negative_wait_PRECISION( mu, &(l->s_PRECISION.op.c), l ); + } + } + END_LOCKED_MASTER(threading) + SYNC_HYPERTHREADS(threading) + + + for ( int a=threading->n_core*threading->thread+threading->core; an_core*threading->n_thread ) { + + // new aggregate is starting, zero out tmp + for(int i=0; i<4*4*n*OPERATOR_COMPONENT_OFFSET_PRECISION; i++) + tmp[i] = 0.0; + + for ( int site=a*aggregate_sites; site<(a+1)*aggregate_sites; site++ ) { + for ( mu=0; mu<4; mu++ ) { + if( (direction_flags+8*(site%(l->block_lattice[T]*l->block_lattice[Z]*l->block_lattice[Y]*l->block_lattice[X])))[2*mu+1] != 0) + continue; + + if(l->depth == 0) + for ( int c=0; cnum_eig_vect; c+=SIMD_LENGTH_PRECISION ) + 
d_neighbor_aggregate_PRECISION_vectorized( eta1+c*fine_components, eta2+c*fine_components, + operator+c*l->vector_size, mu, &(l->s_PRECISION), l, site ); + else + for ( int c=0; cnum_eig_vect; c+=SIMD_LENGTH_PRECISION ) + coarse_aggregate_neighbor_couplings_PRECISION_vectorized( eta1+c*fine_components, eta2+c*fine_components, + operator+c*l->vector_size, mu, &(l->s_PRECISION), l, site ); + set_coarse_neighbor_coupling_PRECISION_vectorized( eta1, eta2, operator, mu, l, site, n, tmp+mu*4*n*OPERATOR_COMPONENT_OFFSET_PRECISION ); + } + } + + // aggregate is done, finalize + for ( mu=0; mu<4; mu++ ) + set_coarse_neighbor_coupling_PRECISION_vectorized_finalize( mu, l, a*aggregate_sites, n, tmp+mu*4*n*OPERATOR_COMPONENT_OFFSET_PRECISION ); + } + + SYNC_HYPERTHREADS(threading) + SYNC_CORES(threading) + + for ( int a=threading->n_core*threading->thread+threading->core; an_core*threading->n_thread ) { + + // new aggregate is starting, zero out tmp + for(int i=0; i<4*4*n*OPERATOR_COMPONENT_OFFSET_PRECISION; i++) + tmp[i] = 0.0; + + for ( int site=a*aggregate_sites; site<(a+1)*aggregate_sites; site++ ) { + if(l->depth == 0) { + for ( int c=0; cnum_eig_vect; c+=SIMD_LENGTH_PRECISION ) + diagonal_aggregate_PRECISION_vectorized( eta1+c*fine_components, eta2+c*fine_components, + operator+c*l->vector_size, &(l->s_PRECISION), l, site ); + } else { + for ( int c=0; cnum_eig_vect; c+=SIMD_LENGTH_PRECISION ) + coarse_aggregate_block_diagonal_PRECISION_vectorized( eta1+c*fine_components, eta2+c*fine_components, + operator+c*l->vector_size, &(l->s_PRECISION), l, site ); + } + set_coarse_block_diagonal_PRECISION_vectorized( eta1, eta2, operator, l, site, n, tmp ); + } + + // aggregate is done, finalize + set_coarse_block_diagonal_PRECISION_vectorized_finalize( l, a*aggregate_sites, n, tmp ); + } + + SYNC_HYPERTHREADS(threading) + SYNC_CORES(threading) + + coarse_operator_PRECISION_setup_finalize( l, threading ); + + START_MASTER(threading) + FREE_HUGEPAGES( mpi_buffer, complex_PRECISION, OPERATOR_COMPONENT_OFFSET_PRECISION*(l->vector_size-l->inner_vector_size) ); + + t1 = MPI_Wtime(); + if ( g.print > 0 ) printf0("depth: %d, time spent for setting up next coarser operator: %lf seconds\n", l->depth, t1-t0 ); + END_MASTER(threading) + + SYNC_HYPERTHREADS(threading) + SYNC_CORES(threading) +} +#endif + + +void set_coarse_self_coupling_PRECISION_vectorized( complex_PRECISION *spin_0_1, complex_PRECISION *spin_2_3, + complex_PRECISION *V, level_struct *l, int site, const int n_rhs, complex_PRECISION *tmp ) { + + int k, m, k1, k2, num_eig_vect = l->next_level->num_lattice_site_var/2, + offset = l->num_lattice_site_var/2; + PRECISION *spin_0_1_pt; + PRECISION *spin_2_3_pt; + PRECISION *interpolation_data; + + int component_offset = SIMD_LENGTH_PRECISION; + int fine_components = l->num_lattice_site_var; + + // U(x) = [ A B , A=A*, D=D*, C = -B* + // C D ] + // storage order: upper triangle of A, upper triangle of D, B, columnwise + // diagonal coupling + for ( int n=0; nvector_size + fine_components*SIMD_LENGTH_PRECISION*site); + + // A + buffer_re = mm_load_PRECISION((PRECISION *)(tmp+k1)+k+0*OPERATOR_COMPONENT_OFFSET_PRECISION); + buffer_im = mm_load_PRECISION((PRECISION *)(tmp+k1)+k+1*OPERATOR_COMPONENT_OFFSET_PRECISION); + for ( m=0; m broadcast + mm_PRECISION spin_0_1_re = mm_set1_PRECISION(spin_0_1_pt[(2*m+0)*component_offset]); + mm_PRECISION spin_0_1_im = mm_set1_PRECISION(spin_0_1_pt[(2*m+1)*component_offset]); + mm_PRECISION interpolation_data_re = mm_load_PRECISION(interpolation_data + 
(2*m+0)*component_offset); + mm_PRECISION interpolation_data_im = mm_load_PRECISION(interpolation_data + (2*m+1)*component_offset); + + cfmadd_conj_PRECISION(interpolation_data_re, interpolation_data_im, spin_0_1_re, spin_0_1_im, &buffer_re, &buffer_im); + } + mm_store_PRECISION((PRECISION *)(tmp+k1)+k+0*OPERATOR_COMPONENT_OFFSET_PRECISION, buffer_re); + mm_store_PRECISION((PRECISION *)(tmp+k1)+k+1*OPERATOR_COMPONENT_OFFSET_PRECISION, buffer_im); + + // D + buffer_re = mm_load_PRECISION((PRECISION *)(tmp+k2)+k+0*OPERATOR_COMPONENT_OFFSET_PRECISION); + buffer_im = mm_load_PRECISION((PRECISION *)(tmp+k2)+k+1*OPERATOR_COMPONENT_OFFSET_PRECISION); + for ( m=offset; m<2*offset; m++ ) { + // spin_2_3 is the same for all k => broadcast + mm_PRECISION spin_2_3_re = mm_set1_PRECISION(spin_2_3_pt[(2*m+0)*component_offset]); + mm_PRECISION spin_2_3_im = mm_set1_PRECISION(spin_2_3_pt[(2*m+1)*component_offset]); + mm_PRECISION interpolation_data_re = mm_load_PRECISION(interpolation_data + (2*m+0)*component_offset); + mm_PRECISION interpolation_data_im = mm_load_PRECISION(interpolation_data + (2*m+1)*component_offset); + + cfmadd_conj_PRECISION(interpolation_data_re, interpolation_data_im, spin_2_3_re, spin_2_3_im, &buffer_re, &buffer_im); + } + mm_store_PRECISION((PRECISION *)(tmp+k2)+k+0*OPERATOR_COMPONENT_OFFSET_PRECISION, buffer_re); + mm_store_PRECISION((PRECISION *)(tmp+k2)+k+1*OPERATOR_COMPONENT_OFFSET_PRECISION, buffer_im); + } + + // index k used for vectorization + for ( k=0; kvector_size + fine_components*component_offset*site); + + // B + buffer_re = mm_load_PRECISION((PRECISION *)(tmp+k1)+k+0*OPERATOR_COMPONENT_OFFSET_PRECISION); + buffer_im = mm_load_PRECISION((PRECISION *)(tmp+k1)+k+1*OPERATOR_COMPONENT_OFFSET_PRECISION); + for ( m=0; m broadcast + mm_PRECISION spin_2_3_re = mm_set1_PRECISION(spin_2_3_pt[(2*m+0)*component_offset]); + mm_PRECISION spin_2_3_im = mm_set1_PRECISION(spin_2_3_pt[(2*m+1)*component_offset]); + mm_PRECISION interpolation_data_re = mm_load_PRECISION(interpolation_data + (2*m+0)*component_offset); + mm_PRECISION interpolation_data_im = mm_load_PRECISION(interpolation_data + (2*m+1)*component_offset); + + cfmadd_conj_PRECISION(interpolation_data_re, interpolation_data_im, spin_2_3_re, spin_2_3_im, &buffer_re, &buffer_im); + } + mm_store_PRECISION((PRECISION *)(tmp+k1)+k+0*OPERATOR_COMPONENT_OFFSET_PRECISION, buffer_re); + mm_store_PRECISION((PRECISION *)(tmp+k1)+k+1*OPERATOR_COMPONENT_OFFSET_PRECISION, buffer_im); + } + } +} + + +void set_coarse_block_diagonal_PRECISION_vectorized( complex_PRECISION *spin_0_1, complex_PRECISION *spin_2_3, + complex_PRECISION *V, level_struct *l, int site, const int n_rhs, complex_PRECISION *tmp ) { + + int k, m, k1, k2, num_eig_vect = l->next_level->num_parent_eig_vect, + offset = l->num_parent_eig_vect; + PRECISION *spin_0_1_pt; + PRECISION *spin_2_3_pt; + PRECISION *interpolation_data; + + int component_offset = SIMD_LENGTH_PRECISION; + int fine_components = l->num_lattice_site_var; + + // U(x) = [ A 0 , A=A*, D=D* + // 0 D ] + // storage order: upper triangle of A, upper triangle of D, B, columnwise + // diagonal coupling + for ( int n=0; nvector_size + fine_components*SIMD_LENGTH_PRECISION*site); + + // A + buffer_re = mm_load_PRECISION((PRECISION *)(tmp+k1)+k+0*OPERATOR_COMPONENT_OFFSET_PRECISION); + buffer_im = mm_load_PRECISION((PRECISION *)(tmp+k1)+k+1*OPERATOR_COMPONENT_OFFSET_PRECISION); + for ( m=0; m broadcast + mm_PRECISION spin_0_1_re = mm_set1_PRECISION(spin_0_1_pt[(2*m+0)*component_offset]); + mm_PRECISION spin_0_1_im 
= mm_set1_PRECISION(spin_0_1_pt[(2*m+1)*component_offset]); + mm_PRECISION interpolation_data_re = mm_load_PRECISION(interpolation_data + (2*m+0)*component_offset); + mm_PRECISION interpolation_data_im = mm_load_PRECISION(interpolation_data + (2*m+1)*component_offset); + + cfmadd_conj_PRECISION(interpolation_data_re, interpolation_data_im, spin_0_1_re, spin_0_1_im, &buffer_re, &buffer_im); + } + mm_store_PRECISION((PRECISION *)(tmp+k1)+k+0*OPERATOR_COMPONENT_OFFSET_PRECISION, buffer_re); + mm_store_PRECISION((PRECISION *)(tmp+k1)+k+1*OPERATOR_COMPONENT_OFFSET_PRECISION, buffer_im); + + // D + buffer_re = mm_load_PRECISION((PRECISION *)(tmp+k2)+k+0*OPERATOR_COMPONENT_OFFSET_PRECISION); + buffer_im = mm_load_PRECISION((PRECISION *)(tmp+k2)+k+1*OPERATOR_COMPONENT_OFFSET_PRECISION); + for ( m=offset; m<2*offset; m++ ) { + // spin_2_3 is the same for all k => broadcast + mm_PRECISION spin_2_3_re = mm_set1_PRECISION(spin_2_3_pt[(2*m+0)*component_offset]); + mm_PRECISION spin_2_3_im = mm_set1_PRECISION(spin_2_3_pt[(2*m+1)*component_offset]); + mm_PRECISION interpolation_data_re = mm_load_PRECISION(interpolation_data + (2*m+0)*component_offset); + mm_PRECISION interpolation_data_im = mm_load_PRECISION(interpolation_data + (2*m+1)*component_offset); + + cfmadd_conj_PRECISION(interpolation_data_re, interpolation_data_im, spin_2_3_re, spin_2_3_im, &buffer_re, &buffer_im); + } + mm_store_PRECISION((PRECISION *)(tmp+k2)+k+0*OPERATOR_COMPONENT_OFFSET_PRECISION, buffer_re); + mm_store_PRECISION((PRECISION *)(tmp+k2)+k+1*OPERATOR_COMPONENT_OFFSET_PRECISION, buffer_im); + } + } +} + +void set_coarse_self_coupling_PRECISION_vectorized_finalize( level_struct *l, int site, const int n_rhs, complex_PRECISION *tmp ) { + + int k, k1, k2, num_aggregates = l->is_PRECISION.num_agg, + num_eig_vect = l->next_level->num_lattice_site_var/2, + aggregate_size = l->inner_vector_size / num_aggregates, + clover_site_size = (l->next_level->num_lattice_site_var*(l->next_level->num_lattice_site_var+1))/2; + int t1, t2; + + config_PRECISION clover_pt, clover = l->next_level->op_PRECISION.clover; + + // just an abbreviation + int component_offset = OPERATOR_COMPONENT_OFFSET_PRECISION; + int fine_components = l->num_lattice_site_var; + + int aggregate = (fine_components*site)/aggregate_size; + clover_pt = clover + aggregate*clover_site_size; + + // U(x) = [ A B , A=A*, D=D*, C = -B* + // C D ] + // storage order: upper triangle of A, upper triangle of D, B, columnwise + // diagonal coupling + for ( int n=0; nnext_level->num_lattice_site_var/2, + offset = l->num_lattice_site_var/2; + + PRECISION *spin_0_1_pt; + PRECISION *spin_2_3_pt; + PRECISION *interpolation_data; + + int component_offset = SIMD_LENGTH_PRECISION; + int fine_components = l->num_lattice_site_var; + + // U_mu(x) = [ A B , U_-mu(x+muhat) = [ A* -C* + // C D ] -B* D* ] + // storage order: A, C, B, D, each column wise + for ( int n=0; nvector_size + fine_components*component_offset*site); + + k1 = (n+0*num_eig_vect)*OPERATOR_COMPONENT_OFFSET_PRECISION; + k2 = (n+1*num_eig_vect)*OPERATOR_COMPONENT_OFFSET_PRECISION; + + // A + buffer_re = mm_load_PRECISION((PRECISION *)(tmp+k1)+k+0*OPERATOR_COMPONENT_OFFSET_PRECISION); + buffer_im = mm_load_PRECISION((PRECISION *)(tmp+k1)+k+1*OPERATOR_COMPONENT_OFFSET_PRECISION); + for ( m=0; m broadcast + mm_PRECISION spin_0_1_re = mm_set1_PRECISION(spin_0_1_pt[(2*m+0)*component_offset]); + mm_PRECISION spin_0_1_im = mm_set1_PRECISION(spin_0_1_pt[(2*m+1)*component_offset]); + mm_PRECISION interpolation_data_re = 
mm_load_PRECISION(interpolation_data + (2*m+0)*component_offset); + mm_PRECISION interpolation_data_im = mm_load_PRECISION(interpolation_data + (2*m+1)*component_offset); + + cfmadd_conj_PRECISION(interpolation_data_re, interpolation_data_im, spin_0_1_re, spin_0_1_im, &buffer_re, &buffer_im); + } + mm_store_PRECISION((PRECISION *)(tmp+k1)+k+0*OPERATOR_COMPONENT_OFFSET_PRECISION, buffer_re); + mm_store_PRECISION((PRECISION *)(tmp+k1)+k+1*OPERATOR_COMPONENT_OFFSET_PRECISION, buffer_im); + + // C + buffer_re = mm_load_PRECISION((PRECISION *)(tmp+k2)+k+0*OPERATOR_COMPONENT_OFFSET_PRECISION); + buffer_im = mm_load_PRECISION((PRECISION *)(tmp+k2)+k+1*OPERATOR_COMPONENT_OFFSET_PRECISION); + for ( m=offset; m<2*offset; m++ ) { + // spin_0_1 is the same for all k => broadcast + mm_PRECISION spin_0_1_re = mm_set1_PRECISION(spin_0_1_pt[(2*m+0)*component_offset]); + mm_PRECISION spin_0_1_im = mm_set1_PRECISION(spin_0_1_pt[(2*m+1)*component_offset]); + mm_PRECISION interpolation_data_re = mm_load_PRECISION(interpolation_data + (2*m+0)*component_offset); + mm_PRECISION interpolation_data_im = mm_load_PRECISION(interpolation_data + (2*m+1)*component_offset); + + cfmadd_conj_PRECISION(interpolation_data_re, interpolation_data_im, spin_0_1_re, spin_0_1_im, &buffer_re, &buffer_im); + } + mm_store_PRECISION((PRECISION *)(tmp+k2)+k+0*OPERATOR_COMPONENT_OFFSET_PRECISION, buffer_re); + mm_store_PRECISION((PRECISION *)(tmp+k2)+k+1*OPERATOR_COMPONENT_OFFSET_PRECISION, buffer_im); + + + k1 = (n+2*num_eig_vect)*OPERATOR_COMPONENT_OFFSET_PRECISION; + k2 = (n+3*num_eig_vect)*OPERATOR_COMPONENT_OFFSET_PRECISION; + + // B + buffer_re = mm_load_PRECISION((PRECISION *)(tmp+k1)+k+0*OPERATOR_COMPONENT_OFFSET_PRECISION); + buffer_im = mm_load_PRECISION((PRECISION *)(tmp+k1)+k+1*OPERATOR_COMPONENT_OFFSET_PRECISION); + for ( m=0; m broadcast + mm_PRECISION spin_2_3_re = mm_set1_PRECISION(spin_2_3_pt[(2*m+0)*component_offset]); + mm_PRECISION spin_2_3_im = mm_set1_PRECISION(spin_2_3_pt[(2*m+1)*component_offset]); + mm_PRECISION interpolation_data_re = mm_load_PRECISION(interpolation_data + (2*m+0)*component_offset); + mm_PRECISION interpolation_data_im = mm_load_PRECISION(interpolation_data + (2*m+1)*component_offset); + + cfmadd_conj_PRECISION(interpolation_data_re, interpolation_data_im, spin_2_3_re, spin_2_3_im, &buffer_re, &buffer_im); + } + mm_store_PRECISION((PRECISION *)(tmp+k1)+k+0*OPERATOR_COMPONENT_OFFSET_PRECISION, buffer_re); + mm_store_PRECISION((PRECISION *)(tmp+k1)+k+1*OPERATOR_COMPONENT_OFFSET_PRECISION, buffer_im); + + // D + buffer_re = mm_load_PRECISION((PRECISION *)(tmp+k2)+k+0*OPERATOR_COMPONENT_OFFSET_PRECISION); + buffer_im = mm_load_PRECISION((PRECISION *)(tmp+k2)+k+1*OPERATOR_COMPONENT_OFFSET_PRECISION); + for ( m=offset; m<2*offset; m++ ) { + // spin_2_3 is the same for all k => broadcast + mm_PRECISION spin_2_3_re = mm_set1_PRECISION(spin_2_3_pt[(2*m+0)*component_offset]); + mm_PRECISION spin_2_3_im = mm_set1_PRECISION(spin_2_3_pt[(2*m+1)*component_offset]); + mm_PRECISION interpolation_data_re = mm_load_PRECISION(interpolation_data + (2*m+0)*component_offset); + mm_PRECISION interpolation_data_im = mm_load_PRECISION(interpolation_data + (2*m+1)*component_offset); + + cfmadd_conj_PRECISION(interpolation_data_re, interpolation_data_im, spin_2_3_re, spin_2_3_im, &buffer_re, &buffer_im); + } + mm_store_PRECISION((PRECISION *)(tmp+k2)+k+0*OPERATOR_COMPONENT_OFFSET_PRECISION, buffer_re); + mm_store_PRECISION((PRECISION *)(tmp+k2)+k+1*OPERATOR_COMPONENT_OFFSET_PRECISION, buffer_im); + } + } +} + + +void 
set_coarse_neighbor_coupling_PRECISION_vectorized_finalize( const int mu, level_struct *l, int site, + const int n_rhs, complex_PRECISION *tmp ) { + + int k, k1, k2, num_eig_vect = l->next_level->num_lattice_site_var/2, + D_link_size = num_eig_vect*num_eig_vect*4; + int t1, t2; + + config_PRECISION D_pt, D = l->next_level->op_PRECISION.D; + + // just an abbreviation + int component_offset = OPERATOR_COMPONENT_OFFSET_PRECISION; + int fine_components = l->num_lattice_site_var; + + int aggregate = (fine_components*site)/(l->inner_vector_size / l->is_PRECISION.num_agg); + D_pt = D + (4*aggregate+mu)*D_link_size; + + // U_mu(x) = [ A B , U_-mu(x+muhat) = [ A* -C* + // C D ] -B* D* ] + // storage order: A, C, B, D, each column wise + for ( int n=0; nis_PRECISION.num_agg, + num_eig_vect = l->next_level->num_parent_eig_vect, + aggregate_size = l->inner_vector_size / num_aggregates, + block_site_size = (l->next_level->num_parent_eig_vect*(l->next_level->num_parent_eig_vect+1)); + int t1, t2; + + config_PRECISION block_pt, block = l->next_level->op_PRECISION.odd_proj; + + // just an abbreviation + int component_offset = OPERATOR_COMPONENT_OFFSET_PRECISION; + int fine_components = l->num_lattice_site_var; + + int aggregate = (fine_components*site)/aggregate_size; + block_pt = block + aggregate*block_site_size; + + // U(x) = [ A 0 , A=A*, D=D* + // 0 D ] + // storage order: upper triangle of A, upper triangle of D, B, columnwise + // diagonal coupling + for ( int n=0; n i) { + ip = j; + jp = i; + sign = -1.0; + } + int offset_to_column = (ip*ip+ip)/2; // upper triangle including diagonal + out_tmp[(2*i+0)*column_offset + j] = creal(clover[offset_to_column+jp]); + out_tmp[(2*i+1)*column_offset + j] = sign*cimag(clover[offset_to_column+jp]); + // C = -B^dagger + out_tmp[(2*i+0)*column_offset + j + vecs] = -creal(clover[offset_to_B + j*vecs+i]); + out_tmp[(2*i+1)*column_offset + j + vecs] = cimag(clover[offset_to_B + j*vecs+i]); + } + // zero + for(int j=2*vecs; j i) { + ip = j; + jp = i; + sign = -1.0; + } + int offset_to_column = (ip*ip+ip)/2; // upper triangle including diagonal + out_tmp[(2*(i+vecs)+0)*column_offset + j + vecs] = creal(clover[offset_to_D + offset_to_column+jp]); + out_tmp[(2*(i+vecs)+1)*column_offset + j + vecs] = sign*cimag(clover[offset_to_D + offset_to_column+jp]); + } + // zero + for(int j=2*vecs; j i) { + ip = j; + jp = i; + sign = -1.0; + } + int offset_to_column = (ip*ip+ip)/2; // upper triangle including diagonal + // A + out_tmp[(2*(i+0*vecs)+0)*column_offset + j + 0*vecs] = + out_tmp[(2*(i+1*vecs)+0)*column_offset + j + 1*vecs] = creal(clover[offset_to_column+jp]); + out_tmp[(2*(i+0*vecs)+1)*column_offset + j + 0*vecs] = + out_tmp[(2*(i+1*vecs)+1)*column_offset + j + 1*vecs] = sign*cimag(clover[offset_to_column+jp]); + // B + out_tmp[(2*(i+2*vecs)+0)*column_offset + j + 0*vecs] = + out_tmp[(2*(i+3*vecs)+0)*column_offset + j + 1*vecs] = creal(clover[offset_to_B + i*vecs+j]); + out_tmp[(2*(i+2*vecs)+1)*column_offset + j + 0*vecs] = + out_tmp[(2*(i+3*vecs)+1)*column_offset + j + 1*vecs] = cimag(clover[offset_to_B + i*vecs+j]); + // C = -B^dagger + out_tmp[(2*(i+0*vecs)+0)*column_offset + j + 2*vecs] = + out_tmp[(2*(i+1*vecs)+0)*column_offset + j + 3*vecs] = -creal(clover[offset_to_B + j*vecs+i]); + out_tmp[(2*(i+0*vecs)+1)*column_offset + j + 2*vecs] = + out_tmp[(2*(i+1*vecs)+1)*column_offset + j + 3*vecs] = cimag(clover[offset_to_B + j*vecs+i]); + // D + out_tmp[(2*(i+2*vecs)+0)*column_offset + j + 2*vecs] = + out_tmp[(2*(i+3*vecs)+0)*column_offset + j + 3*vecs] = 
creal(clover[offset_to_D + offset_to_column+jp]); + out_tmp[(2*(i+2*vecs)+1)*column_offset + j + 2*vecs] = + out_tmp[(2*(i+3*vecs)+1)*column_offset + j + 3*vecs] = sign*cimag(clover[offset_to_D + offset_to_column+jp]); + // 0 + out_tmp[(2*(i+0*vecs)+0)*column_offset + j + 1*vecs] = + out_tmp[(2*(i+0*vecs)+1)*column_offset + j + 1*vecs] = + out_tmp[(2*(i+0*vecs)+0)*column_offset + j + 3*vecs] = + out_tmp[(2*(i+0*vecs)+1)*column_offset + j + 3*vecs] = + out_tmp[(2*(i+1*vecs)+0)*column_offset + j + 0*vecs] = + out_tmp[(2*(i+1*vecs)+1)*column_offset + j + 0*vecs] = + out_tmp[(2*(i+1*vecs)+0)*column_offset + j + 2*vecs] = + out_tmp[(2*(i+1*vecs)+1)*column_offset + j + 2*vecs] = + out_tmp[(2*(i+2*vecs)+0)*column_offset + j + 1*vecs] = + out_tmp[(2*(i+2*vecs)+1)*column_offset + j + 1*vecs] = + out_tmp[(2*(i+2*vecs)+0)*column_offset + j + 3*vecs] = + out_tmp[(2*(i+2*vecs)+1)*column_offset + j + 3*vecs] = + out_tmp[(2*(i+3*vecs)+0)*column_offset + j + 0*vecs] = + out_tmp[(2*(i+3*vecs)+1)*column_offset + j + 0*vecs] = + out_tmp[(2*(i+3*vecs)+0)*column_offset + j + 2*vecs] = + out_tmp[(2*(i+3*vecs)+1)*column_offset + j + 2*vecs] = 0.0; + } + // zero + for(int j=4*vecs; j i) { + ip = j; + jp = i; + sign = -1.0; + } + int offset_to_column = (ip*ip+ip)/2; // upper triangle including diagonal + // E + out_tmp[(2*i+0)*column_offset + j] += sign*creal(tm_term[offset_to_column+jp]); + out_tmp[(2*i+1)*column_offset + j] += cimag(tm_term[offset_to_column+jp]); + // F + out_tmp[(2*(i+vecs)+0)*column_offset + j + vecs] += sign*creal(tm_term[offset_to_F + offset_to_column+jp]); + out_tmp[(2*(i+vecs)+1)*column_offset + j + vecs] += cimag(tm_term[offset_to_F + offset_to_column+jp]); + } + } + tm_term += 2*offset_to_F; + // out_tmp is an alias for the actual output + out_tmp += 2*column_offset*2*vecs; + } +#endif +} + +void add_tm_term_to_doublet_vectorized_layout_PRECISION(config_PRECISION tm_term, OPERATOR_TYPE_PRECISION *clover_vectorized, + int num_aggregates, int num_eig_vect) { +#ifdef HAVE_TM + int vecs = num_eig_vect; + // in vectorized layout clover is stored column wise, but not split into ABCD + // each column is padded, such that next column can also start at 64B boundary + int column_offset = SIMD_LENGTH_PRECISION*((4*vecs+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); + // offset between blocks in clover + int offset_to_F = (vecs*vecs+vecs)/2; // upper triangle of A including diagonal + + PRECISION *out_tmp = clover_vectorized; + + // we add/sub the tm term to cloverD_vectorized + // A0B0 E000 0000 + // 0A0B + 0000 - 0E00 + // C0D0 00F0 0000 + // 0C0D 0000 000F + // 0000 0000 0000 + // (column wise, size of zeros such that columns length is multiple of 64B) + + // 4 directions + for ( int a=0; a i) { + ip = j; + jp = i; + sign = -1.0; + } + int offset_to_column = (ip*ip+ip)/2; // upper triangle including diagonal + // E + out_tmp[(2*(i+0*vecs)+0)*column_offset + j + 0*vecs] += sign*creal(tm_term[offset_to_column+jp]); + out_tmp[(2*(i+0*vecs)+1)*column_offset + j + 0*vecs] += cimag(tm_term[offset_to_column+jp]); + out_tmp[(2*(i+1*vecs)+0)*column_offset + j + 1*vecs] -= sign*creal(tm_term[offset_to_column+jp]); + out_tmp[(2*(i+1*vecs)+1)*column_offset + j + 1*vecs] -= cimag(tm_term[offset_to_column+jp]); + // F + out_tmp[(2*(i+2*vecs)+0)*column_offset + j + 2*vecs] += sign*creal(tm_term[offset_to_F+offset_to_column+jp]); + out_tmp[(2*(i+2*vecs)+1)*column_offset + j + 2*vecs] += cimag(tm_term[offset_to_F+offset_to_column+jp]); + out_tmp[(2*(i+3*vecs)+0)*column_offset + j + 3*vecs] -= 
sign*creal(tm_term[offset_to_F+offset_to_column+jp]); + out_tmp[(2*(i+3*vecs)+1)*column_offset + j + 3*vecs] -= cimag(tm_term[offset_to_F+offset_to_column+jp]); + } + } + tm_term += 2*offset_to_F; + // out_tmp is an alias for the actual output + out_tmp += 2*4*vecs*column_offset; + } +#endif +} + +void add_epsbar_term_to_doublet_vectorized_layout_PRECISION(config_PRECISION eps_term, OPERATOR_TYPE_PRECISION *clover_vectorized, + int num_aggregates, int num_eig_vect) { +#ifdef HAVE_TM1p1 + int vecs = num_eig_vect; + // in vectorized layout clover is stored column wise, but not split into ABCD + // each column is padded, such that next column can also start at 64B boundary + int column_offset = SIMD_LENGTH_PRECISION*((4*vecs+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); + // offset between blocks in clover + int offset_to_F = (vecs*vecs+vecs)/2; // upper triangle of A including diagonal + + PRECISION *out_tmp = clover_vectorized; + + // we add the eps term to cloverD_vectorized + // A0B0 0E00 + // 0A0B + E000 + // C0D0 000F + // 0C0D 00F0 + // 0000 0000 + // (column wise, size of zeros such that columns length is multiple of 64B) + + // 4 directions + for ( int a=0; a i) { + ip = j; + jp = i; + sign = -1.0; + } + int offset_to_column = (ip*ip+ip)/2; // upper triangle including diagonal + // E + out_tmp[(2*(i+0*vecs)+0)*column_offset + j + 1*vecs] += sign*creal(eps_term[offset_to_column+jp]); + out_tmp[(2*(i+0*vecs)+1)*column_offset + j + 1*vecs] += cimag(eps_term[offset_to_column+jp]); + out_tmp[(2*(i+1*vecs)+0)*column_offset + j + 0*vecs] += sign*creal(eps_term[offset_to_column+jp]); + out_tmp[(2*(i+1*vecs)+1)*column_offset + j + 0*vecs] += cimag(eps_term[offset_to_column+jp]); + // F + out_tmp[(2*(i+2*vecs)+0)*column_offset + j + 3*vecs] += sign*creal(eps_term[offset_to_F+offset_to_column+jp]); + out_tmp[(2*(i+2*vecs)+1)*column_offset + j + 3*vecs] += cimag(eps_term[offset_to_F+offset_to_column+jp]); + out_tmp[(2*(i+3*vecs)+0)*column_offset + j + 2*vecs] += sign*creal(eps_term[offset_to_F+offset_to_column+jp]); + out_tmp[(2*(i+3*vecs)+1)*column_offset + j + 2*vecs] += cimag(eps_term[offset_to_F+offset_to_column+jp]); + } + } + eps_term += 2*offset_to_F; + // out_tmp is an alias for the actual output + out_tmp += 2*4*vecs*column_offset; + } +#endif +} + +void coarse_aggregate_self_couplings_PRECISION_vectorized( complex_PRECISION *eta1, complex_PRECISION *eta2, + complex_PRECISION *phi, schwarz_PRECISION_struct *s, + level_struct *l, int site, int *direction_flags ) { + + int offset = SIMD_LENGTH_PRECISION; + int site_offset = l->num_lattice_site_var*offset; + int index_bw; + int index_fw; + int *neighbor = s->op.neighbor_table; + int *backward_neighbor = s->op.backward_neighbor_table; + complex_PRECISION *phi_pt; + config_PRECISION D_pt; + config_PRECISION D = s->op.D; + int n = l->num_lattice_site_var; + int D_site_offset = 4*n*n; + int D_link_offset = n*n; + int clover_offset = (n*(n+1))/2*site; + + coarse_spinwise_site_self_couplings_PRECISION_vectorized( eta1, eta2, phi+site_offset*site, s->op.clover+clover_offset, offset, l ); + + for(int mu=0; mu<4; mu++) { + index_fw = neighbor[5*site+1 + mu]; + index_bw = backward_neighbor[5*site+1 + mu]; + + // from backward + if ( direction_flags[2*mu+0] == 1 ) { + D_pt = D + D_site_offset*index_bw + D_link_offset*mu; + phi_pt = phi + site_offset*index_bw; + coarse_spinwise_n_daggered_hopp_PRECISION_vectorized( eta1, eta2, phi_pt, D_pt, offset, l ); + } + + // from forward + if ( direction_flags[2*mu+1] == 1 ) { + D_pt = D + D_site_offset*site 
+ D_link_offset*mu; + phi_pt = phi + site_offset*index_fw; + coarse_spinwise_pn_hopp_PRECISION_vectorized( eta1, eta2, phi_pt, D_pt, offset, l, -1 ); + } + } +} + +void coarse_aggregate_block_diagonal_PRECISION_vectorized( complex_PRECISION *eta1, complex_PRECISION *eta2, + complex_PRECISION *phi, schwarz_PRECISION_struct *s, + level_struct *l, int site ) { + + int offset = SIMD_LENGTH_PRECISION; + int site_offset = l->num_lattice_site_var*offset; + int n = l->num_parent_eig_vect; + int block_offset = (n*(n+1))*site; + config_PRECISION block = s->op.odd_proj+block_offset; + int num_eig_vect = l->num_parent_eig_vect; + int block_step_size = (num_eig_vect * (num_eig_vect+1))/2; + complex_PRECISION *eta[2] = {eta1, eta2}; + phi += site_offset*site; + + // U(x) = [ A 0 , A=A*, D=D* + // 0 D ] + // storage order: upper triangle of A, upper triangle of D, B, columnwise + + mm_PRECISION block_re; + mm_PRECISION block_im; + mm_PRECISION in_re; + mm_PRECISION in_im; + mm_PRECISION out_re; + mm_PRECISION out_im; + + // zero output matrices + mm_PRECISION zero = mm_setzero_PRECISION(); + for(int s=0; s<2; s++) { + for(int i=0; ieta1) or 2and3 (->eta2) + eta[1] += num_eig_vect*offset; + for(int s=0; s<2; s++) { + // A and D: column major hermitian, stored as upper triangular + for(int i=0; inum_lattice_site_var*offset; + int index_fw; + int *neighbor = s->op.neighbor_table; + complex_PRECISION *phi_pt; + config_PRECISION D_pt; + config_PRECISION D = s->op.D; + int n = l->num_lattice_site_var; + int D_site_offset = 4*n*n; + int D_link_offset = n*n; + + vector_PRECISION_define_zero( eta1, 0, n*offset, l, no_threading ); + vector_PRECISION_define_zero( eta2, 0, n*offset, l, no_threading ); + + // requires the positive boundaries of phi to be communicated before + index_fw = neighbor[5*site+1 + mu]; + D_pt = D + D_site_offset*site + D_link_offset*mu; + phi_pt = phi + site_offset*index_fw; + coarse_spinwise_pn_hopp_PRECISION_vectorized( eta1, eta2, phi_pt, D_pt, offset, l, +1 ); +} + + +void coarse_spinwise_site_self_couplings_PRECISION_vectorized( + complex_PRECISION *eta1, complex_PRECISION *eta2, + complex_PRECISION *phi, config_PRECISION clover, int elements, level_struct *l ) { + + int num_eig_vect = l->num_lattice_site_var/2; + int clover_step_size1 = (num_eig_vect * (num_eig_vect+1))/2; + complex_PRECISION *eta[2] = {eta1, eta2}; + // U(x) = [ A B , A=A*, D=D*, C = -B* + // C D ] + // storage order: upper triangle of A, upper triangle of D, B, columnwise + + mm_PRECISION clover_re; + mm_PRECISION clover_im; + mm_PRECISION in_re; + mm_PRECISION in_im; + mm_PRECISION out_re; + mm_PRECISION out_im; + + // zero output matrices + mm_PRECISION zero = mm_setzero_PRECISION(); + for(int s=0; s<2; s++) { + for(int i=0; ieta1) or 2and3 (->eta2) + eta[1] += num_eig_vect*elements; + for(int s=0; s<2; s++) { + // A and D: column major hermitian, stored as upper triangular + for(int i=0; inum_lattice_site_var; +#ifndef OPTIMIZED_COARSE_SELF_COUPLING_PRECISION + int num_eig_vect = l->num_parent_eig_vect, + clover_size = (2*num_eig_vect*num_eig_vect+num_eig_vect), + block_size = (num_eig_vect*num_eig_vect+num_eig_vect); + + coarse_self_couplings_clover_PRECISION( eta+start*vector_size, phi+start*vector_size, + op->clover+start*clover_size, (end-start)*vector_size, l ); +#ifdef HAVE_TM // tm_term + if (op->mu + op->mu_odd_shift != 0.0 || op->mu + op->mu_even_shift != 0.0 ) + coarse_add_anti_block_diagonal_PRECISION( eta+start*vector_size, phi+start*vector_size, + op->tm_term+start*block_size, (end-start)*vector_size, 
l ); +#endif +#ifdef HAVE_TM1p1 //eps_term + if ( g.n_flavours == 2 && + ( op->epsbar != 0 || op->epsbar_ig5_odd_shift != 0 || op->epsbar_ig5_odd_shift != 0 ) ) + coarse_add_doublet_coupling_PRECISION( eta+start*vector_size, phi+start*vector_size, + op->epsbar_term+start*block_size, (end-start)*vector_size, l ); +#endif + +#else + + int lda = SIMD_LENGTH_PRECISION*((vector_size+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); +#ifdef HAVE_TM1p1 + OPERATOR_TYPE_PRECISION *clover = (g.n_flavours == 2) ? op->clover_doublet_vectorized:op->clover_vectorized; +#else + OPERATOR_TYPE_PRECISION *clover = op->clover_vectorized; +#endif + for(int i=start; inum_parent_eig_vect, + num_eig_vect2 = SQUARE(l->num_parent_eig_vect); + // U_mu(x) = [ A B , U_-mu(x+muhat) = [ A* -C* + // C D ] -B* D* ] + // storage order: A, C, B, D + // note: minus sign of D = self_coupling - hopping_term is added here + +#ifdef HAVE_TM1p1 + if( g.n_flavours == 2 ) { + // A + pnmv_PRECISION( eta, D, phi, num_eig_vect, -sign ); + eta += num_eig_vect;//1 + phi += num_eig_vect;//1 + pnmv_PRECISION( eta, D, phi, num_eig_vect, -sign ); + // C + eta += num_eig_vect;//2 + phi -= num_eig_vect;//0 + D += num_eig_vect2; + pnmv_PRECISION( eta, D, phi, num_eig_vect, -sign ); + eta += num_eig_vect;//3 + phi += num_eig_vect;//1 + pnmv_PRECISION( eta, D, phi, num_eig_vect, -sign ); + // B + eta -= 3*num_eig_vect;//0 + phi += num_eig_vect;//2 + D += num_eig_vect2; + pnmv_PRECISION( eta, D, phi, num_eig_vect, -sign ); + eta += num_eig_vect;//1 + phi += num_eig_vect;//3 + pnmv_PRECISION( eta, D, phi, num_eig_vect, -sign ); + // D + eta += num_eig_vect;//2 + phi -= num_eig_vect;//2 + D += num_eig_vect2; + pnmv_PRECISION( eta, D, phi, num_eig_vect, -sign ); + eta += num_eig_vect;//3 + phi += num_eig_vect;//3 + pnmv_PRECISION( eta, D, phi, num_eig_vect, -sign ); + } else { +#endif + // A + pnmv_PRECISION( eta, D, phi, num_eig_vect, -sign ); + // C + eta += num_eig_vect; + D += num_eig_vect2; + pnmv_PRECISION( eta, D, phi, num_eig_vect, -sign ); + // B + phi += num_eig_vect; + eta -= num_eig_vect; + D += num_eig_vect2; + pnmv_PRECISION( eta, D, phi, num_eig_vect, -sign ); + // D + eta += num_eig_vect; + D += num_eig_vect2; + pnmv_PRECISION( eta, D, phi, num_eig_vect, -sign ); +#ifdef HAVE_TM1p1 + } +#endif + } + + static inline void coarse_pn_daggered_hopp_PRECISION( vector_PRECISION eta, vector_PRECISION phi, + config_PRECISION D, const int sign, + level_struct *l ) { + + int num_eig_vect = l->num_parent_eig_vect, + num_eig_vect2 = SQUARE(l->num_parent_eig_vect); + // U_mu(x) = [ A B , U_-mu(x+muhat) = [ A* -C* + // C D ] -B* D* ] + // storage order: A, C, B, D + // note: minus sign of D = self_coupling - hopping_term is added here + +#ifdef HAVE_TM1p1 + if( g.n_flavours == 2 ) { + // A* + pnmvh_PRECISION( eta, D, phi, num_eig_vect, -sign ); + eta += num_eig_vect;//1 + phi += num_eig_vect;//1 + pnmvh_PRECISION( eta, D, phi, num_eig_vect, -sign ); + // -C* + eta -= num_eig_vect;//0 + phi += num_eig_vect;//2 + D += num_eig_vect2; + pnmvh_PRECISION( eta, D, phi, num_eig_vect, sign ); + eta += num_eig_vect;//1 + phi += num_eig_vect;//3 + pnmvh_PRECISION( eta, D, phi, num_eig_vect, sign ); + // -B* + eta += num_eig_vect;//2 + phi -= 3*num_eig_vect;//0 + D += num_eig_vect2; + pnmvh_PRECISION( eta, D, phi, num_eig_vect, sign ); + eta += num_eig_vect;//3 + phi += num_eig_vect;//1 + pnmvh_PRECISION( eta, D, phi, num_eig_vect, sign ); + // D* + eta -= num_eig_vect;//2 + phi += num_eig_vect;//2 + D += num_eig_vect2; + pnmvh_PRECISION( eta, D, phi, 
num_eig_vect, -sign ); + eta += num_eig_vect;//3 + phi += num_eig_vect;//3 + pnmvh_PRECISION( eta, D, phi, num_eig_vect, -sign ); + } else { +#endif + // A* + pnmvh_PRECISION( eta, D, phi, num_eig_vect, -sign ); + // -C* + phi += num_eig_vect; + D += num_eig_vect2; + pnmvh_PRECISION( eta, D, phi, num_eig_vect, sign ); + // -B* + eta += num_eig_vect; + phi -= num_eig_vect; + D += num_eig_vect2; + pnmvh_PRECISION( eta, D, phi, num_eig_vect, sign ); + // D* + phi += num_eig_vect; + D += num_eig_vect2; + pnmvh_PRECISION( eta, D, phi, num_eig_vect, -sign ); +#ifdef HAVE_TM1p1 + } +#endif + } + + static inline void coarse_pn_hopp_PRECISION_vectorized( vector_PRECISION eta, vector_PRECISION phi, OPERATOR_TYPE_PRECISION *D, const int sign, level_struct *l ) { +#ifdef OPTIMIZED_COARSE_NEIGHBOR_COUPLING_PRECISION + int nv = l->num_parent_eig_vect; + int lda = 2*SIMD_LENGTH_PRECISION*((nv+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); + switch (sign) { + case -1: + cgemv_padded_PRECISION( 2*nv, D, lda, nv, (float *)phi, (float *)eta); + break; + case +1: + default: + cgenmv_padded_PRECISION( 2*nv, D, lda, nv, (float *)phi, (float *)eta); + break; + } +#endif + } + + static inline void coarse_pn_hopping_term_PRECISION( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, const int amount, const int sign, level_struct *l, struct Thread *threading ) { + + START_NO_HYPERTHREADS(threading) + + int mu, i, num_site_var=l->num_lattice_site_var, + num_eig_vect = l->num_parent_eig_vect, + num_lattice_sites, start, end, core_start, core_end, + plus_dir_param, minus_dir_param; + vector_PRECISION in_pt, out_pt; + +#ifndef OPTIMIZED_COARSE_NEIGHBOR_COUPLING_PRECISION + int num_link_var = SQUARE(2*num_eig_vect), + num_4link_var = 4*num_link_var; + config_PRECISION D_pt; + // dagger applied by functions daggered_hopp below + config_PRECISION D = op->D, D_dagger = op->D; +#else + int column_offset = 2*SIMD_LENGTH_PRECISION*((num_eig_vect+SIMD_LENGTH_PRECISION-1)/ + SIMD_LENGTH_PRECISION), + num_link_var = 2*2*num_eig_vect*column_offset, + num_4link_var = 4*num_link_var; + OPERATOR_TYPE_PRECISION *D_pt; + // dagger applied in D_dagger + OPERATOR_TYPE_PRECISION *D = op->D_vectorized, *D_dagger = op->D_transformed_vectorized; +#endif + +#ifndef COMM_HIDING_COARSEOP + int communicate = ( l->num_processes > 1 && op->c.comm ) ? 1:0; + int *neighbor_fw = op->neighbor_table; + int *neighbor_bw = op->backward_neighbor_table; +#else + int communicate = ( op->c.comm ) ? 
1:0; + int *neighbor_fw = op->neighbor_table; +#endif + + switch (amount) { + case _EVEN_SITES: + minus_dir_param = _ODD_SITES; + plus_dir_param = _EVEN_SITES; + break; + case _ODD_SITES: + minus_dir_param = _EVEN_SITES; + plus_dir_param = _ODD_SITES; + break; + case _FULL_SYSTEM: + default: + minus_dir_param = _FULL_SYSTEM; + plus_dir_param = _FULL_SYSTEM; + break; + } + + // assumptions (1) self coupling has already been performed + // OR (2) "out" is initialized with zeros + set_boundary_PRECISION( out, 0, l, threading ); + + // communicate in -mu direction + MASTER(threading) + if ( communicate ) + for ( mu=0; mu<4; mu++ ) + ghost_sendrecv_PRECISION( in, mu, -1, &(op->c), minus_dir_param, l ); + SYNC_CORES(threading); + + switch (amount) { + case _EVEN_SITES: + start = op->num_even_sites; + num_lattice_sites = op->num_odd_sites; + break; + case _ODD_SITES: + start = 0; + num_lattice_sites = op->num_even_sites; + break; + case _FULL_SYSTEM: + default: + start=0; + num_lattice_sites=l->num_inner_lattice_sites; + break; + } + end = start + num_lattice_sites; + compute_core_start_end_custom( start, end, &core_start, &core_end, l, threading, 1 ); + +#ifndef COMM_HIDING_COARSEOP + // prepare for sending to fw: compute hopping terms into forward boundary buffer + if ( communicate ) + for ( i=core_start; inum_inner_lattice_sites) //num_lattice_sites? + continue; + out_pt = out + num_site_var*neighbor_fw[5*i+1+mu]; +#ifndef OPTIMIZED_COARSE_NEIGHBOR_COUPLING_PRECISION + coarse_pn_daggered_hopp_PRECISION( out_pt, in_pt, D_pt+mu*num_link_var, sign, l ); +#else + coarse_pn_hopp_PRECISION_vectorized( out_pt, in_pt, D_pt+mu*num_link_var, sign, l ); +#endif + } + } +#else + // compute U_mu^dagger coupling + for ( mu=0; mu<4; mu++ ) { + for ( i=core_start; ic), plus_dir_param, l ); + } + for ( mu=0; mu<4; mu++ ) { + // wait for -mu direction + ghost_wait_PRECISION( in, mu, -1, &(op->c), minus_dir_param, l ); + } + END_LOCKED_MASTER(threading); + } + else + SYNC_CORES(threading); + + switch (amount) { + case _EVEN_SITES: + start = 0; + num_lattice_sites = op->num_even_sites; + break; + case _ODD_SITES: + start = op->num_even_sites; + num_lattice_sites = op->num_odd_sites; + break; + case _FULL_SYSTEM: + default: + start=0; + num_lattice_sites=l->num_inner_lattice_sites; + break; + } + end = start + num_lattice_sites; + compute_core_start_end_custom( start, end, &core_start, &core_end, l, threading, 1 ); + +#ifndef COMM_HIDING_COARSEOP + for ( i=core_start; i= l->num_inner_lattice_sites) + continue; + in_pt = in + num_site_var*neighbor_bw[5*i+1+mu]; + D_pt = D_dagger + num_4link_var*neighbor_bw[5*i+1+mu] + mu*num_link_var; +#ifndef OPTIMIZED_COARSE_NEIGHBOR_COUPLING_PRECISION + coarse_pn_daggered_hopp_PRECISION( out_pt, in_pt, D_pt, sign, l ); +#else + coarse_pn_hopp_PRECISION_vectorized( out_pt, in_pt, D_pt, sign, l ); +#endif + } + + // compute U_mu couplings + D_pt = D + num_4link_var*neighbor_fw[5*i]; + for( mu=0; mu<4; mu++ ) { + in_pt = in + num_site_var*neighbor_fw[5*i+1+mu]; +#ifndef OPTIMIZED_COARSE_NEIGHBOR_COUPLING_PRECISION + coarse_pn_hopp_PRECISION( out_pt, in_pt, D_pt+mu*num_link_var, sign, l ); +#else + coarse_pn_hopp_PRECISION_vectorized( out_pt, in_pt, D_pt+mu*num_link_var, sign, l ); +#endif + } + } +#else + // compute U_mu couplings + for ( i=core_start; ic), plus_dir_param, l ); + } + END_LOCKED_MASTER(threading); + } + else + SYNC_CORES(threading); + + END_NO_HYPERTHREADS(threading); + } + +static inline void coarse_spinwise_pn_hopp_PRECISION_vectorized( complex_PRECISION *eta1, 
complex_PRECISION *eta2, complex_PRECISION *phi, config_PRECISION D, int elements, level_struct *l, const int sign ) { + + int num_eig_vect = l->num_lattice_site_var/2; + int num_eig_vect2 = num_eig_vect*num_eig_vect; + complex_PRECISION *eta[2] = {eta1, eta2}; + // U_mu(x) = [ A B , U_-mu(x+muhat) = [ A* -C* + // C D ] -B* D* ] + // storage order: A, C, B, D + + mm_PRECISION D_re; + mm_PRECISION D_im; + mm_PRECISION in_re; + mm_PRECISION in_im; + mm_PRECISION out_re; + mm_PRECISION out_im; + // s refers to "spin" components 0and1 (->eta1) or 2and3 (->eta2) + for(int s=0; s<2; s++) { + // t is the row of the input matrix (in 2x2 block form) + for(int t=0; t<2; t++) { + for(int i=0; inum_lattice_site_var/2; + int num_eig_vect2 = num_eig_vect*num_eig_vect; + complex_PRECISION *eta[2] = {eta1, eta2}; + // U_mu(x) = [ A B , U_-mu(x+muhat) = [ A* -C* + // C D ] -B* D* ] + // storage order: A, C, B, D + // note: minus sign of D = self_coupling - hopping_term is added here + + mm_PRECISION D_re; + mm_PRECISION D_im; + mm_PRECISION in_re; + mm_PRECISION in_im; + mm_PRECISION out_re; + mm_PRECISION out_im; + // A* + for(int i=0; inum_lattice_site_var/2, n2 = l->num_lattice_site_var; - - // set the matrix up + // + // output = [ A+E B + // C D+F ] LU decomposed + + register int i, j, k, n = l->num_parent_eig_vect, n2 = 2*n; + config_PRECISION clover = op->clover + n*(n2+1)*index; // A for ( j=0; jtm_term + n*(n+1)*index; + if (op->mu + op->mu_odd_shift != 0.0 || op->mu + op->mu_even_shift != 0.0 ) { + // E for ( j=0; jnum_parent_eig_vect, n2 = 2*n, n3 = 3*n, n4 = 4*n; + // set the matrix up + // 0 + for ( j=0; jclover + n*(n2+1)*index; + // A + for ( j=0; jtm_term + n*(n+1)*index; + if (op->mu + op->mu_odd_shift != 0.0 || op->mu + op->mu_even_shift != 0.0 ) { + // E + for ( j=0; jepsbar_term + n*(n+1)*index; + // G + for ( j=n; jnum_even_sites, &start, &end, l, threading, 1 ); + // even sites + coarse_self_couplings_PRECISION( y, x, op, start, end, l ); } - -void coarse_diag_ee_PRECISION( vector_PRECISION y, vector_PRECISION x, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { +void coarse_diag_oo_PRECISION( vector_PRECISION y, vector_PRECISION x, operator_PRECISION_struct *op, + level_struct *l, struct Thread *threading ) { - int n1 = op->num_even_sites; - int start; - int end; - compute_core_start_end_custom(0, n1, &start, &end, l, threading, 1); - // even sites -#ifndef VECTORIZE_COARSE_OPERATOR_PRECISION - int offset = l->num_lattice_site_var; - coarse_self_couplings_PRECISION( y+start*offset, x+start*offset, op->clover+start*(offset*offset+offset)/2, (end-start)*offset, l ); -#ifdef HAVE_TM // tm_term - if (g.tm_mu + g.tm_mu_odd_shift != 0.0 || g.tm_mu + g.tm_mu_even_shift != 0.0 ) - coarse_add_anti_block_diagonal_PRECISION( y+start*offset, x+start*offset, op->tm_term+start*(offset*offset/2+offset)/2, (end-start)*offset, l ); -#endif + int start, end; +#ifndef OPTIMIZED_COARSE_SELF_COUPLING_PRECISION + int num_site_var=l->num_lattice_site_var, + oo_inv_size = SQUARE(num_site_var); +#ifdef HAVE_TM1p1 + config_PRECISION sc = (g.n_flavours==2) ? 
op->clover_doublet_oo_inv:op->clover_oo_inv; #else - coarse_self_couplings_PRECISION_vectorized( y, x, op->clover_vectorized, start, end, l ); + config_PRECISION sc = op->clover_oo_inv; #endif -} + compute_core_start_end_custom( 0, op->num_odd_sites, &start, &end, l, threading, 1 ); + + x += num_site_var*(op->num_even_sites+start); + y += num_site_var*(op->num_even_sites+start); + sc += oo_inv_size*start; -void coarse_diag_oo_PRECISION( vector_PRECISION y, vector_PRECISION x, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { - - int n1 = op->num_even_sites, n2 = op->num_odd_sites, - offset = l->num_lattice_site_var, ess = (l->num_lattice_site_var/2)*(l->num_lattice_site_var+1); - config_PRECISION sc = op->clover; - int start; - int end; - compute_core_start_end_custom(n1, n1+n2, &start, &end, l, threading, 1); - - x += start*offset; - y += start*offset; - sc += start*ess; - - // odd sites -#ifndef VECTORIZE_COARSE_OPERATOR_PRECISION - int oss = l->num_lattice_site_var*l->num_lattice_site_var; for ( int i=start; i use standard non-vectorized multiplication - if ( l->level == 0 ) { - coarse_self_couplings_PRECISION( y, x, sc, (end-start)*offset, l ); -#ifdef HAVE_TM - int tms = (l->num_lattice_site_var/2)*(l->num_lattice_site_var/2+1); - config_PRECISION tm = op->tm_term + start*tms; - if (g.tm_mu + g.tm_mu_odd_shift != 0.0 || g.tm_mu + g.tm_mu_even_shift != 0.0 ) - coarse_add_anti_block_diagonal_PRECISION( y, x, tm, (end-start)*offset, l ); -#endif - } else - coarse_self_couplings_PRECISION_vectorized( y-start*offset, x-start*offset, op->clover_vectorized, start, end, l ); + compute_core_start_end_custom( op->num_even_sites, l->num_inner_lattice_sites, &start, &end, l, threading, 1 ); + coarse_self_couplings_PRECISION( y, x, op, start, end, l ); #endif } - -void coarse_diag_oo_inv_PRECISION( vector_PRECISION y, vector_PRECISION x, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { +void coarse_diag_PRECISION( vector_PRECISION y, vector_PRECISION x, operator_PRECISION_struct *op, level_struct *l ) { - int n1 = op->num_even_sites, n2 = op->num_odd_sites, start, end; + coarse_diag_ee_PRECISION( y, x, op, l, no_threading ); + coarse_diag_oo_PRECISION( y, x, op, l, no_threading ); +} - compute_core_start_end_custom(n1, n1+n2, &start, &end, l, threading, 1); +void coarse_diag_oo_inv_PRECISION( vector_PRECISION y, vector_PRECISION x, operator_PRECISION_struct *op, + level_struct *l, struct Thread *threading ) { - // odd sites -#ifndef VECTORIZE_COARSE_OPERATOR_PRECISION - int offset = l->num_lattice_site_var, ess = (l->num_lattice_site_var/2)*(l->num_lattice_site_var+1), - oss = l->num_lattice_site_var*l->num_lattice_site_var; - config_PRECISION sc = op->clover; - x += start*offset; - y += start*offset; - sc += n1*ess + (start-n1)*oss; + int start, end; + compute_core_start_end_custom( 0, op->num_odd_sites, &start, &end, l, threading, 1 ); + // odd sites + int num_site_var = l->num_lattice_site_var, + oo_inv_size = SQUARE(num_site_var); + +#ifndef OPTIMIZED_COARSE_SELF_COUPLING_PRECISION +#ifdef HAVE_TM1p1 + config_PRECISION sc = (g.n_flavours==2) ? op->clover_doublet_oo_inv:op->clover_oo_inv; +#else + config_PRECISION sc = op->clover_oo_inv; +#endif +#else + int lda = SIMD_LENGTH_PRECISION*((num_site_var+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); + oo_inv_size = 2*num_site_var*lda; +#ifdef HAVE_TM1p1 + OPERATOR_TYPE_PRECISION *sc = (g.n_flavours==2) ? 
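/* Illustration (stand-alone sketch, not a routine from this patch): the pointer
 * arithmetic above encodes the even-first ordering -- odd site i starts at vector
 * offset ns*(num_even+i), and its precomputed dense block (clover_oo_inv) at
 * i*ns*ns.  A sketch of the per-site block application this addressing feeds,
 * assuming row-major blocks; names are placeholders. */
#include <complex.h>
#include <stddef.h>

static void diag_oo_inv_sketch( float _Complex *y, const float _Complex *x,
                                const float _Complex *oo_inv,
                                int num_even, int num_odd, int ns ) {
  for ( int i=0; i<num_odd; i++ ) {
    float _Complex *y_pt = y + (size_t)ns*(num_even+i);        /* odd site i   */
    const float _Complex *x_pt = x + (size_t)ns*(num_even+i);
    const float _Complex *blk  = oo_inv + (size_t)i*ns*ns;     /* its block    */
    for ( int r=0; r<ns; r++ ) {
      y_pt[r] = 0;
      for ( int c=0; c<ns; c++ )
        y_pt[r] += blk[r*ns+c] * x_pt[c];                      /* y = A^{-1} x */
    }
  }
}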
op->clover_doublet_oo_inv_vectorized:op->clover_oo_inv_vectorized; +#else + OPERATOR_TYPE_PRECISION *sc = op->clover_oo_inv_vectorized; +#endif +#endif + + x += num_site_var*(op->num_even_sites+start); + y += num_site_var*(op->num_even_sites+start); + sc += oo_inv_size*start; + for ( int i=start; iclover_vectorized, start, end, l ); + for(int j=0; jnum_inner_lattice_sites, sc_size = (l->num_lattice_site_var/2)*(l->num_lattice_site_var+1), - nc_size = SQUARE(l->num_lattice_site_var), - t, z, y, x; +void coarse_oddeven_PRECISION_set_self_couplings( level_struct *l, struct Thread *threading ) { + operator_PRECISION_struct *op = &(l->oe_op_PRECISION); - config_PRECISION sc_in = in->clover, nc_in = in->D, Aee = NULL, Aoo = NULL; - int *le = l->local_lattice; - int oe_offset = op->oe_offset; - -#ifndef VECTORIZE_COARSE_OPERATOR_PRECISION - int lu_dec_size = SQUARE(l->num_lattice_site_var); -#endif - - Aee = op->clover; - Aoo = op->clover + op->num_even_sites*sc_size; -#ifdef HAVE_TM - int jt=0, kt=0, tm_size = (l->num_lattice_site_var/2)*(l->num_lattice_site_var/2+1); - config_PRECISION tm_in = in->tm_term, TMee = NULL, TMoo = NULL; - TMee = op->tm_term; - TMoo = op->tm_term + op->num_even_sites*tm_size; -#endif - - START_LOCKED_MASTER(threading) - // self coupling - if ( reorder ) { - int k=0, index, *it = in->index_table, *dt = in->table_dim; - j=0; - for ( t=0; tnum_even_sites*sc_size; - for ( i=0; inum_even_sites*tm_size; -#endif - j = op->num_odd_sites; - for ( i=0; inum_parent_eig_vect, start, end; + + coarse_operator_PRECISION_set_self_couplings( op, l, threading ); + compute_core_start_end_custom( 0, op->num_odd_sites, &start, &end, l, threading, 1); + +#ifndef OPTIMIZED_COARSE_SELF_COUPLING_PRECISION + + int size = SQUARE(2*nv); + for( int i=start; iclover_oo_inv+i*size, op, op->num_even_sites+i, l ); + +#ifdef HAVE_TM1p1 + int size_doublet = SQUARE(4*nv); + for( int i=start; iclover_doublet_oo_inv+i*size_doublet, op, + op->num_even_sites+i, l ); #endif - sc_in += sc_size; Aoo += lu_dec_size; - } + #else - for ( i=op->num_even_sites*sc_size; iindex_table, *dt = in->table_dim, site_size=4*nc_size; - config_PRECISION oAe=op->D, eAo=(op->D)+site_size*op->num_even_sites; - j=0; - for ( t=0; tD[i] = nc_in[i]; - } - END_LOCKED_MASTER(threading) -#ifdef VECTORIZE_COARSE_OPERATOR_PRECISION - int start; - int end; - compute_core_start_end_custom(0, n, &start, &end, l, threading, 1); - int n_per_core = end-start; - int column_offset = SIMD_LENGTH_PRECISION*((l->num_lattice_site_var+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - int offset_v = 2*l->num_lattice_site_var*column_offset; - copy_coarse_operator_to_vectorized_layout_PRECISION( - op->D + 4*start*nc_size, - op->D_vectorized + 4*start*offset_v, - n_per_core, l->num_lattice_site_var/2); - copy_coarse_operator_to_transformed_vectorized_layout_PRECISION( - op->D + 4*start*nc_size, - op->D_transformed_vectorized + 4*start*offset_v, - n_per_core, l->num_lattice_site_var/2); - copy_coarse_operator_clover_to_vectorized_layout_PRECISION( - op->clover + start*sc_size, - op->clover_vectorized + start*offset_v, - n_per_core, l->num_lattice_site_var/2); -#ifdef HAVE_TM - if (g.tm_mu + g.tm_mu_odd_shift != 0.0 || g.tm_mu + g.tm_mu_even_shift != 0.0 ) - add_tm_term_to_vectorized_layout_PRECISION( - op->tm_term + start*tm_size, - op->clover_vectorized + start*offset_v, - n_per_core, l->num_lattice_site_var/2); + int column_offset = SIMD_LENGTH_PRECISION*((2*nv+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); + int size_v = 2*2*nv*column_offset; + 
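/* Illustration (stand-alone arithmetic check, not from this patch): the vectorized
 * branch above pads the site-block dimension 2*nv up to a multiple of the SIMD width
 * before sizing clover_oo_inv_vectorized; the leading factor 2 accounts for the
 * complex entries.  The concrete numbers (SIMD_LEN = 8, nv = 10) are arbitrary
 * examples. */
#include <stdio.h>

int main( void ) {
  const int SIMD_LEN = 8;                              /* assumed SIMD width        */
  int nv   = 10;                                       /* example eigenvector count */
  int rows = 2*nv;                                     /* site-block dimension      */
  int column_offset = SIMD_LEN*((rows + SIMD_LEN - 1)/SIMD_LEN);  /* padded: 24     */
  int size_v = 2*rows*column_offset;                   /* per-site footprint: 960   */
  printf( "rows=%d padded=%d per-site=%d\n", rows, column_offset, size_v );
  return 0;
}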
for( int i=start; iclover_oo_inv_vectorized + i*size_v, + op->clover_vectorized + (op->num_even_sites+i)*size_v, column_offset ); + +#ifdef HAVE_TM1p1 + int column_doublet_offset = SIMD_LENGTH_PRECISION*((4*nv+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); + int size_doublet_v = 2*4*nv*column_doublet_offset; + for( int i=start; iclover_doublet_oo_inv_vectorized + i*size_doublet_v, + op->clover_doublet_vectorized + (op->num_even_sites+i)*size_doublet_v, column_doublet_offset ); #endif - SYNC_CORES(threading) - - compute_core_start_end_custom(op->num_even_sites, n, &start, &end, l, threading, 1); - OPERATOR_TYPE_PRECISION tmp[offset_v] __attribute__((aligned(64))); - for(int a=start; aclover_vectorized + a*offset_v)[i]; - cgem_inverse(l->num_lattice_site_var, op->clover_vectorized + a*offset_v, tmp, column_offset); - } - SYNC_CORES(threading) #endif } -void coarse_oddeven_setup_PRECISION( operator_PRECISION_struct *in, int reorder, level_struct *l ) { +void coarse_oddeven_PRECISION_set_couplings( level_struct *l, struct Thread *threading ) { + + coarse_oddeven_PRECISION_set_self_couplings( l, threading ); + coarse_operator_PRECISION_set_neighbor_couplings( &(l->oe_op_PRECISION), l, threading ); + +} + +void coarse_oddeven_alloc_PRECISION( level_struct *l ) { - int n=l->num_inner_lattice_sites, oe_offset=0, mu, nu, - lu_dec_size = SQUARE(l->num_lattice_site_var), - nc_size = SQUARE(l->num_lattice_site_var), bs, **bt = NULL, - *eot = NULL, *nt = NULL, *tt = NULL, t, z, y, x, le[4], N[4]; + int nv = l->num_parent_eig_vect, + oe_offset=0, mu, **bt = NULL, + *eot = NULL, *nt = NULL, *tt = NULL, t, z, y, x, le[4], N[4]; operator_PRECISION_struct *op = &(l->oe_op_PRECISION); + operator_PRECISION_alloc( op, _ODDEVEN, l ); + + // buffers + MALLOC( op->buffer, complex_PRECISION*, 2 ); + op->buffer[0] = NULL; +#ifdef HAVE_TM1p1 + MALLOC( op->buffer[0], complex_PRECISION, 4*l->vector_size ); + op->buffer[1] = op->buffer[0] + 2*l->vector_size; +#else + MALLOC( op->buffer[0], complex_PRECISION, 2*l->vector_size ); + op->buffer[1] = op->buffer[0] + l->vector_size; +#endif + for ( mu=0; mu<4; mu++ ) { le[mu] = l->local_lattice[mu]; N[mu] = le[mu]+1; @@ -434,801 +476,160 @@ void coarse_oddeven_setup_PRECISION( operator_PRECISION_struct *in, int reorder, op->num_even_sites++; } } + +#ifndef OPTIMIZED_COARSE_SELF_COUPLING_PRECISION - MALLOC( op->D, complex_PRECISION, 4*nc_size*n ); - MALLOC( op->clover, complex_PRECISION, lu_dec_size*n ); -#ifdef HAVE_TM - int tm_size = (l->num_lattice_site_var/2)*(l->num_lattice_site_var/2+1); - MALLOC( l->oe_op_PRECISION.tm_term, complex_PRECISION, tm_size*n ); + MALLOC( op->clover_oo_inv, complex_PRECISION, SQUARE(2*nv)*op->num_odd_sites ); +#ifdef HAVE_TM1p1 + MALLOC( op->clover_doublet_oo_inv, complex_PRECISION, SQUARE(4*nv)*op->num_odd_sites ); +#endif + +#else + int column_offset = SIMD_LENGTH_PRECISION*((2*nv+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); + MALLOC_HUGEPAGES( op->clover_oo_inv_vectorized, PRECISION, 2*2*nv*column_offset*op->num_odd_sites, 4*SIMD_LENGTH_PRECISION ); +#ifdef HAVE_TM1p1 + int column_doublet_offset = SIMD_LENGTH_PRECISION*((4*nv+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); + MALLOC_HUGEPAGES( op->clover_doublet_oo_inv_vectorized, PRECISION, 2*4*nv*column_doublet_offset*op->num_odd_sites, 4*SIMD_LENGTH_PRECISION ); #endif -#ifdef VECTORIZE_COARSE_OPERATOR_PRECISION - int column_offset = SIMD_LENGTH_PRECISION*((l->num_lattice_site_var+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - // 2 is for complex, 4 is for 4 directions - 
MALLOC_HUGEPAGES( op->D_vectorized, OPERATOR_TYPE_PRECISION, 2*4*l->num_lattice_site_var*column_offset*n, 64 ); - MALLOC_HUGEPAGES( op->D_transformed_vectorized, OPERATOR_TYPE_PRECISION, 2*4*l->num_lattice_site_var*column_offset*n, 64 ); - MALLOC_HUGEPAGES( op->clover_vectorized, OPERATOR_TYPE_PRECISION, 2*l->num_lattice_site_var*column_offset*n, 64 ); + #endif - coarse_oddeven_setup_PRECISION_set_couplings( in, reorder, l, no_threading ); - // define data layout - MALLOC( op->index_table, int, N[T]*N[Z]*N[Y]*N[X] ); eot = op->index_table; - define_eot( eot, N, l ); - + // neighbor table, translation table - MALLOC( op->neighbor_table, int, 5*N[T]*N[Z]*N[Y]*N[X] ); - MALLOC( op->backward_neighbor_table, int, 5*N[T]*N[Z]*N[Y]*N[X] ); - MALLOC( op->translation_table, int, le[T]*le[Z]*le[Y]*le[X] ); nt = op->neighbor_table; tt = op->translation_table; - define_nt_bt_tt( nt, op->backward_neighbor_table, NULL, tt, eot, N, l ); - + // boundary table - for ( mu=0; mu<4; mu++ ) { - bs = 1; - le[mu] = 1; - for ( nu=0; nu<4; nu++ ) - bs *= le[nu]; - - MALLOC( op->c.boundary_table[2*mu], int, bs ); - op->c.boundary_table[2*mu+1] = op->c.boundary_table[2*mu]; - - le[mu] = l->local_lattice[mu]; - } - bt = op->c.boundary_table; define_eo_bt( bt, eot, op->c.num_even_boundary_sites, op->c.num_odd_boundary_sites, op->c.num_boundary_sites, N, l ); - MALLOC( op->buffer, complex_PRECISION*, 2 ); - op->buffer[0] = NULL; - MALLOC( op->buffer[0], complex_PRECISION, 2*l->vector_size ); - op->buffer[1] = op->buffer[0] + l->vector_size; - ghost_alloc_PRECISION( 0, &(op->c), l ); + // ghost ghost_sendrecv_init_PRECISION( _COARSE_GLOBAL, &(op->c), l ) ; + + // solver if ( l->level == 0 ) l->p_PRECISION.v_end = op->num_even_sites*l->num_lattice_site_var; else l->sp_PRECISION.v_end = op->num_even_sites*l->num_lattice_site_var; -} - -void coarse_oddeven_re_setup_PRECISION( operator_PRECISION_struct *in, int reorder, level_struct *l, struct Thread *threading ) { - coarse_oddeven_setup_PRECISION_set_couplings( in, reorder, l, threading ); -} - -void coarse_oddeven_free_PRECISION( level_struct *l ) { - - int mu, nu, nc_size = SQUARE(l->num_lattice_site_var), - *ll = l->local_lattice, n = l->num_inner_lattice_sites, bs; - - ghost_free_PRECISION( &(l->oe_op_PRECISION.c), l ); - FREE( l->oe_op_PRECISION.D, complex_PRECISION, 4*nc_size*n ); -#ifdef VECTORIZE_COARSE_OPERATOR_PRECISION - int column_offset = SIMD_LENGTH_PRECISION*((l->num_lattice_site_var+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - FREE_HUGEPAGES( l->oe_op_PRECISION.D_vectorized, OPERATOR_TYPE_PRECISION, 2*4*l->num_lattice_site_var*column_offset*n ); - FREE_HUGEPAGES( l->oe_op_PRECISION.D_transformed_vectorized, OPERATOR_TYPE_PRECISION, 2*4*l->num_lattice_site_var*column_offset*n ); - FREE_HUGEPAGES( l->oe_op_PRECISION.clover_vectorized, OPERATOR_TYPE_PRECISION, 2*l->num_lattice_site_var*column_offset*n ); -#endif -#ifdef HAVE_TM - int tm_size = (l->num_lattice_site_var/2)*(l->num_lattice_site_var/2+1); - FREE( l->oe_op_PRECISION.tm_term, complex_PRECISION, tm_size*n ); -#endif - FREE( l->oe_op_PRECISION.clover, complex_PRECISION, nc_size*n ); - FREE( l->oe_op_PRECISION.index_table, int, (ll[T]+1)*(ll[Z]+1)*(ll[Y]+1)*(ll[X]+1) ); - FREE( l->oe_op_PRECISION.neighbor_table, int, 5*(ll[T]+1)*(ll[Z]+1)*(ll[Y]+1)*(ll[X]+1) ); - FREE( l->oe_op_PRECISION.backward_neighbor_table, int, 5*(ll[T]+1)*(ll[Z]+1)*(ll[Y]+1)*(ll[X]+1) ); - FREE( l->oe_op_PRECISION.translation_table, int, ll[T]*ll[Z]*ll[Y]*ll[X] ); - - for ( mu=0; mu<4; mu++ ) { - bs = 1; - for ( nu=0; 
nu<4; nu++ ) - if ( mu != nu ) - bs *= ll[nu]; - - FREE( l->oe_op_PRECISION.c.boundary_table[2*mu], int, bs ); - l->oe_op_PRECISION.c.boundary_table[2*mu+1] = NULL; - } - - FREE( l->oe_op_PRECISION.buffer[0], complex_PRECISION, 2*l->vector_size ); - FREE( l->oe_op_PRECISION.buffer, complex_PRECISION*, 2 ); } +void coarse_oddeven_setup_PRECISION( operator_PRECISION_struct *in, int reorder, level_struct *l, + struct Thread *threading ) { -void coarse_hopping_term_PRECISION( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, - const int amount, level_struct *l, struct Thread *threading ) { + operator_PRECISION_struct *op = &(l->oe_op_PRECISION); - START_NO_HYPERTHREADS(threading) + START_LOCKED_MASTER(threading) + int ns=l->num_inner_lattice_sites, nv = l->num_parent_eig_vect, i, + D_size = 4*SQUARE(2*nv), + clover_size = (nv)*(nv*2+1), + block_size = (nv)*(nv+1); + config_PRECISION D_in = in->D, + clover_in = in->clover, + odd_proj_in = in->odd_proj; - int mu, i, index, num_site_var=l->num_lattice_site_var, - num_4link_var=4*l->num_lattice_site_var*l->num_lattice_site_var, - num_link_var=l->num_lattice_site_var*l->num_lattice_site_var, - start=0, num_lattice_sites=l->num_inner_lattice_sites, - plus_dir_param=_FULL_SYSTEM, minus_dir_param=_FULL_SYSTEM; - vector_PRECISION in_pt, out_pt; - config_PRECISION D_pt; + // neighbor couplings + if ( reorder ) { + int t, z, y, x, index, *le = l->local_lattice, oe_offset = op->oe_offset, + *it = in->index_table, *dt = in->table_dim; + config_PRECISION D_oe = op->D, + D_eo = (op->D)+D_size*op->num_even_sites, + clover_ee = op->clover, + clover_oo = (op->clover)+clover_size*op->num_even_sites, + odd_proj_ee = op->odd_proj, + odd_proj_oo = op->odd_proj+block_size*op->num_even_sites; - int core_start; - int core_end; - - // assumptions (1) self coupling has already been performed - // OR (2) "out" is initialized with zeros - set_boundary_PRECISION( out, 0, l, threading ); - - if ( amount == _EVEN_SITES ) { - minus_dir_param = _ODD_SITES; - plus_dir_param = _EVEN_SITES; - } else if ( amount == _ODD_SITES ) { - minus_dir_param = _EVEN_SITES; - plus_dir_param = _ODD_SITES; - } - - START_MASTER(threading) - if ( op->c.comm ) { - for ( mu=0; mu<4; mu++ ) { - // communicate in -mu direction - ghost_sendrecv_PRECISION( in, mu, -1, &(op->c), minus_dir_param, l ); - } - } - END_MASTER(threading) - SYNC_CORES(threading) - - if ( amount == _EVEN_SITES ) { - start = op->num_even_sites, num_lattice_sites = op->num_odd_sites; - } else if ( amount == _ODD_SITES ) { - start = 0; num_lattice_sites = op->num_even_sites; - } - compute_core_start_end_custom(start, start+num_lattice_sites, &core_start, &core_end, l, threading, 1); - - // compute U_mu^dagger coupling - for ( i=core_start; ineighbor_table[index]; - D_pt = op->D + num_4link_var*op->neighbor_table[index] + 0*num_link_var; - index++; - out_pt = out + num_site_var*op->neighbor_table[index+T]; - coarse_daggered_hopp_PRECISION( out_pt, in_pt, D_pt, l ); - } - SYNC_CORES(threading) - for ( i=core_start; ineighbor_table[index]; - D_pt = op->D + num_4link_var*op->neighbor_table[index] + 1*num_link_var; - index++; - out_pt = out + num_site_var*op->neighbor_table[index+Z]; - coarse_daggered_hopp_PRECISION( out_pt, in_pt, D_pt, l ); - } - SYNC_CORES(threading) - for ( i=core_start; ineighbor_table[index]; - D_pt = op->D + num_4link_var*op->neighbor_table[index] + 2*num_link_var; - index++; - out_pt = out + num_site_var*op->neighbor_table[index+Y]; - coarse_daggered_hopp_PRECISION( out_pt, in_pt, D_pt, 
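/* Illustration (stand-alone arithmetic check, not from this patch): the per-site
 * footprints declared above distinguish full and packed storage.  D keeps four full
 * (2*nv)x(2*nv) direction blocks; clover_size = nv*(2*nv+1) equals the packed
 * triangle of one Hermitian (2*nv)x(2*nv) matrix; block_size = nv*(nv+1) equals two
 * packed nv x nv triangles (this reading of the packing is an inference). */
#include <assert.h>
#include <stdio.h>

int main( void ) {
  int nv = 24, k = 2*nv;                  /* e.g. 24 test vectors per half-spinor */
  int D_size      = 4*k*k;                /* 4 directions, full blocks            */
  int clover_size = nv*(2*nv+1);          /* packed Hermitian (2nv x 2nv)         */
  int block_size  = nv*(nv+1);            /* two packed nv x nv triangles         */
  assert( clover_size == k*(k+1)/2 );
  assert( block_size  == 2*(nv*(nv+1)/2) );
  printf( "D=%d clover=%d block=%d\n", D_size, clover_size, block_size );
  return 0;
}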
l ); - } - SYNC_CORES(threading) - for ( i=core_start; ineighbor_table[index]; - D_pt = op->D + num_4link_var*op->neighbor_table[index] + 3*num_link_var; - index++; - out_pt = out + num_site_var*op->neighbor_table[index+X]; - coarse_daggered_hopp_PRECISION( out_pt, in_pt, D_pt, l ); - } - - START_LOCKED_MASTER(threading) - if ( op->c.comm ) { - for ( mu=0; mu<4; mu++ ) { - // communicate in +mu direction - ghost_sendrecv_PRECISION( out, mu, +1, &(op->c), plus_dir_param, l ); - } - for ( mu=0; mu<4; mu++ ) { - // wait for -mu direction - ghost_wait_PRECISION( in, mu, -1, &(op->c), minus_dir_param, l ); - } - } - END_LOCKED_MASTER(threading) - - if ( amount == _EVEN_SITES ) { - start = 0; num_lattice_sites = op->num_even_sites; - } else if ( amount == _ODD_SITES ) { - start = op->num_even_sites, num_lattice_sites = op->num_odd_sites; - } - compute_core_start_end_custom(start, start+num_lattice_sites, &core_start, &core_end, l, threading, 1); - - // compute U_mu couplings - for ( i=core_start; ineighbor_table[index]; - D_pt = op->D + num_4link_var*op->neighbor_table[index]; - index++; - in_pt = in + num_site_var*op->neighbor_table[index+T]; - coarse_hopp_PRECISION( out_pt, in_pt, D_pt, l ); - - D_pt += num_link_var; - in_pt = in + num_site_var*op->neighbor_table[index+Z]; - coarse_hopp_PRECISION( out_pt, in_pt, D_pt, l ); - - D_pt += num_link_var; - in_pt = in + num_site_var*op->neighbor_table[index+Y]; - coarse_hopp_PRECISION( out_pt, in_pt, D_pt, l ); + for ( t=0; tneighbor_table[index+X]; - coarse_hopp_PRECISION( out_pt, in_pt, D_pt, l ); - } - - START_LOCKED_MASTER(threading) - if ( op->c.comm ) { - for ( mu=0; mu<4; mu++ ) { - // wait for +mu direction - ghost_wait_PRECISION( out, mu, +1, &(op->c), plus_dir_param, l ); + } else { + for ( i=0; iD[i] = D_in[i]; + for ( i=0; iclover[i] = clover_in[i]; + for ( i=0; iodd_proj[i] = odd_proj_in[i]; } + } END_LOCKED_MASTER(threading) + + op->m0 = in->m0; - END_NO_HYPERTHREADS(threading) -} - - -void coarse_n_hopping_term_PRECISION( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, - const int amount, level_struct *l, struct Thread *threading ) { - -#ifdef VECTORIZE_COARSE_OPERATOR_PRECISION -#ifndef COMM_HIDING_COARSEOP - int sign = -1; - coarse_pn_hopping_term_PRECISION_vectorized( out, in, op, amount, l, sign, threading); -#else - coarse_n_hopping_term_PRECISION_vectorized( out, in, op, amount, l, threading ); +#ifdef HAVE_TM + tm_term_PRECISION_setup( in->mu, in->mu_even_shift, in->mu_odd_shift, op, l, threading ); +#endif +#ifdef HAVE_TM1p1 + epsbar_term_PRECISION_setup( in->epsbar, in->epsbar_ig5_even_shift, in->epsbar_ig5_odd_shift, op, l, threading ); #endif - return; -#else - START_NO_HYPERTHREADS(threading) - - int mu, i, index, num_site_var=l->num_lattice_site_var, - num_4link_var=4*l->num_lattice_site_var*l->num_lattice_site_var, - num_link_var=l->num_lattice_site_var*l->num_lattice_site_var, - start=0, num_lattice_sites=l->num_inner_lattice_sites, - plus_dir_param=_FULL_SYSTEM, minus_dir_param=_FULL_SYSTEM; - vector_PRECISION in_pt, out_pt; - config_PRECISION D_pt; - - int core_start; - int core_end; - - // assumptions (1) self coupling has already been performed - // OR (2) "out" is initialized with zeros - set_boundary_PRECISION( out, 0, l, threading ); - if ( amount == _EVEN_SITES ) { - minus_dir_param = _ODD_SITES; - plus_dir_param = _EVEN_SITES; - } else if ( amount == _ODD_SITES ) { - minus_dir_param = _EVEN_SITES; - plus_dir_param = _ODD_SITES; - } + coarse_oddeven_PRECISION_set_couplings( l, threading 
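/* Illustration (stand-alone sketch, not a routine from this patch): the hopping loops
 * above address a neighbor table with a stride of five entries per site -- entry 5*i
 * is the site itself, entries 5*i+1+mu (mu = T,Z,Y,X) its forward neighbors.  A
 * schematic forward sweep built on that layout, with the per-link kernel written out
 * as a plain dense multiply-accumulate (row-major blocks assumed). */
#include <complex.h>
#include <stddef.h>

static void hopping_sweep_sketch( float _Complex *out, const float _Complex *in,
                                  const float _Complex *D, const int *neighbor_table,
                                  int start, int end, int ns ) {
  const int link = ns*ns;                              /* entries per direction block */
  for ( int i=start; i<end; i++ ) {
    float _Complex *out_pt = out + (size_t)ns*neighbor_table[5*i];       /* site      */
    const float _Complex *D_site = D + (size_t)4*link*neighbor_table[5*i];
    for ( int mu=0; mu<4; mu++ ) {
      const float _Complex *in_pt = in + (size_t)ns*neighbor_table[5*i+1+mu]; /* nbr  */
      const float _Complex *D_mu  = D_site + (size_t)mu*link;
      for ( int r=0; r<ns; r++ )
        for ( int c=0; c<ns; c++ )
          out_pt[r] += D_mu[r*ns+c] * in_pt[c];        /* out += D_mu * in(neighbor)   */
    }
  }
}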
); - START_MASTER(threading) - if ( op->c.comm ) { - for ( mu=0; mu<4; mu++ ) { - // communicate in -mu direction - ghost_sendrecv_PRECISION( in, mu, -1, &(op->c), minus_dir_param, l ); - } - } - END_MASTER(threading) - SYNC_CORES(threading) - - if ( amount == _EVEN_SITES ) { - start = op->num_even_sites, num_lattice_sites = op->num_odd_sites; - } else if ( amount == _ODD_SITES ) { - start = 0; num_lattice_sites = op->num_even_sites; - } - compute_core_start_end_custom(start, start+num_lattice_sites, &core_start, &core_end, l, threading, 1); - - // compute U_mu^dagger coupling - for ( i=core_start; ineighbor_table[index]; - D_pt = op->D + num_4link_var*op->neighbor_table[index] + 0*num_link_var; - index++; - out_pt = out + num_site_var*op->neighbor_table[index+T]; - coarse_n_daggered_hopp_PRECISION( out_pt, in_pt, D_pt, l ); - } - SYNC_CORES(threading) - for ( i=core_start; ineighbor_table[index]; - D_pt = op->D + num_4link_var*op->neighbor_table[index] + 1*num_link_var; - index++; - out_pt = out + num_site_var*op->neighbor_table[index+Z]; - coarse_n_daggered_hopp_PRECISION( out_pt, in_pt, D_pt, l ); - } - SYNC_CORES(threading) - for ( i=core_start; ineighbor_table[index]; - D_pt = op->D + num_4link_var*op->neighbor_table[index] + 2*num_link_var; - index++; - out_pt = out + num_site_var*op->neighbor_table[index+Y]; - coarse_n_daggered_hopp_PRECISION( out_pt, in_pt, D_pt, l ); - } - SYNC_CORES(threading) - for ( i=core_start; ineighbor_table[index]; - D_pt = op->D + num_4link_var*op->neighbor_table[index] + 3*num_link_var; - index++; - out_pt = out + num_site_var*op->neighbor_table[index+X]; - coarse_n_daggered_hopp_PRECISION( out_pt, in_pt, D_pt, l ); - } - - START_LOCKED_MASTER(threading) - if ( op->c.comm ) { - for ( mu=0; mu<4; mu++ ) { - // communicate in +mu direction - ghost_sendrecv_PRECISION( out, mu, +1, &(op->c), plus_dir_param, l ); - } - for ( mu=0; mu<4; mu++ ) { - // wait for -mu direction - ghost_wait_PRECISION( in, mu, -1, &(op->c), minus_dir_param, l ); - } - } - END_LOCKED_MASTER(threading) - - if ( amount == _EVEN_SITES ) { - start = 0; num_lattice_sites = op->num_even_sites; - } else if ( amount == _ODD_SITES ) { - start = op->num_even_sites, num_lattice_sites = op->num_odd_sites; - } - compute_core_start_end_custom(start, start+num_lattice_sites, &core_start, &core_end, l, threading, 1); - - // compute U_mu couplings - for ( i=core_start; ineighbor_table[index]; - D_pt = op->D + num_4link_var*op->neighbor_table[index]; - index++; - in_pt = in + num_site_var*op->neighbor_table[index+T]; - coarse_n_hopp_PRECISION( out_pt, in_pt, D_pt, l ); - - D_pt += num_link_var; - in_pt = in + num_site_var*op->neighbor_table[index+Z]; - coarse_n_hopp_PRECISION( out_pt, in_pt, D_pt, l ); - - D_pt += num_link_var; - in_pt = in + num_site_var*op->neighbor_table[index+Y]; - coarse_n_hopp_PRECISION( out_pt, in_pt, D_pt, l ); - - D_pt += num_link_var; - in_pt = in + num_site_var*op->neighbor_table[index+X]; - coarse_n_hopp_PRECISION( out_pt, in_pt, D_pt, l ); - } - - START_LOCKED_MASTER(threading) - if ( op->c.comm ) { - for ( mu=0; mu<4; mu++ ) { - // wait for +mu direction - ghost_wait_PRECISION( out, mu, +1, &(op->c), plus_dir_param, l ); - } - } - END_LOCKED_MASTER(threading) - - END_NO_HYPERTHREADS(threading) -#endif } -void coarse_hopping_term_PRECISION_vectorized( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, - const int amount, level_struct *l, struct Thread *threading ) { - -#ifdef VECTORIZE_COARSE_OPERATOR_PRECISION - START_NO_HYPERTHREADS(threading) - 
- int mu, i, index, num_site_var=l->num_lattice_site_var, - start=0, num_lattice_sites=l->num_inner_lattice_sites, - plus_dir_param=_FULL_SYSTEM, minus_dir_param=_FULL_SYSTEM; - vector_PRECISION in_pt, out_pt; - - OPERATOR_TYPE_PRECISION *D_vectorized; - int column_offset = SIMD_LENGTH_PRECISION*((l->num_lattice_site_var+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - int vectorized_link_offset = 2*l->num_lattice_site_var*column_offset; - - int core_start; - int core_end; - - // assumptions (1) self coupling has already been performed - // OR (2) "out" is initialized with zeros - set_boundary_PRECISION( out, 0, l, threading ); - - if ( amount == _EVEN_SITES ) { - minus_dir_param = _ODD_SITES; - plus_dir_param = _EVEN_SITES; - } else if ( amount == _ODD_SITES ) { - minus_dir_param = _EVEN_SITES; - plus_dir_param = _ODD_SITES; - } - - START_MASTER(threading) - if ( op->c.comm ) { - for ( mu=0; mu<4; mu++ ) { - // communicate in -mu direction - ghost_sendrecv_PRECISION( in, mu, -1, &(op->c), minus_dir_param, l ); - } - } - END_MASTER(threading) - SYNC_CORES(threading) - - if ( amount == _EVEN_SITES ) { - start = op->num_even_sites, num_lattice_sites = op->num_odd_sites; - } else if ( amount == _ODD_SITES ) { - start = 0; num_lattice_sites = op->num_even_sites; - } - compute_core_start_end_custom(start, start+num_lattice_sites, &core_start, &core_end, l, threading, 1); - - // compute U_mu^dagger coupling - for ( i=core_start; ineighbor_table[index]; - D_vectorized = op->D_transformed_vectorized + 4*vectorized_link_offset*op->neighbor_table[index] + 0*vectorized_link_offset; - index++; - out_pt = out + num_site_var*op->neighbor_table[index+T]; - coarse_hopp_PRECISION_vectorized( out_pt, in_pt, D_vectorized, l ); - } - SYNC_CORES(threading) - for ( i=core_start; ineighbor_table[index]; - D_vectorized = op->D_transformed_vectorized + 4*vectorized_link_offset*op->neighbor_table[index] + 1*vectorized_link_offset; - index++; - out_pt = out + num_site_var*op->neighbor_table[index+Z]; - coarse_hopp_PRECISION_vectorized( out_pt, in_pt, D_vectorized, l ); - } - SYNC_CORES(threading) - for ( i=core_start; ineighbor_table[index]; - D_vectorized = op->D_transformed_vectorized + 4*vectorized_link_offset*op->neighbor_table[index] + 2*vectorized_link_offset; - index++; - out_pt = out + num_site_var*op->neighbor_table[index+Y]; - coarse_hopp_PRECISION_vectorized( out_pt, in_pt, D_vectorized, l ); - } - SYNC_CORES(threading) - for ( i=core_start; ineighbor_table[index]; - D_vectorized = op->D_transformed_vectorized + 4*vectorized_link_offset*op->neighbor_table[index] + 3*vectorized_link_offset; - index++; - out_pt = out + num_site_var*op->neighbor_table[index+X]; - coarse_hopp_PRECISION_vectorized( out_pt, in_pt, D_vectorized, l ); - } - - START_LOCKED_MASTER(threading) - if ( op->c.comm ) { - for ( mu=0; mu<4; mu++ ) { - // communicate in +mu direction - ghost_sendrecv_PRECISION( out, mu, +1, &(op->c), plus_dir_param, l ); - } - for ( mu=0; mu<4; mu++ ) { - // wait for -mu direction - ghost_wait_PRECISION( in, mu, -1, &(op->c), minus_dir_param, l ); - } - } - END_LOCKED_MASTER(threading) +void coarse_oddeven_free_PRECISION( level_struct *l ) { + + int nv = l->num_parent_eig_vect, vs = l->vector_size; + operator_PRECISION_struct *op = &(l->oe_op_PRECISION); - if ( amount == _EVEN_SITES ) { - start = 0; num_lattice_sites = op->num_even_sites; - } else if ( amount == _ODD_SITES ) { - start = op->num_even_sites, num_lattice_sites = op->num_odd_sites; - } - compute_core_start_end_custom(start, 
start+num_lattice_sites, &core_start, &core_end, l, threading, 1); - - // compute U_mu couplings - for ( i=core_start; ineighbor_table[index]; - D_vectorized = op->D_vectorized + 4*vectorized_link_offset*op->neighbor_table[index]; - index++; - in_pt = in + num_site_var*op->neighbor_table[index+T]; - coarse_hopp_PRECISION_vectorized( out_pt, in_pt, D_vectorized, l ); - - D_vectorized += vectorized_link_offset; - in_pt = in + num_site_var*op->neighbor_table[index+Z]; - coarse_hopp_PRECISION_vectorized( out_pt, in_pt, D_vectorized, l ); - - in_pt = in + num_site_var*op->neighbor_table[index+Y]; - D_vectorized += vectorized_link_offset; - coarse_hopp_PRECISION_vectorized( out_pt, in_pt, D_vectorized, l ); - - in_pt = in + num_site_var*op->neighbor_table[index+X]; - D_vectorized += vectorized_link_offset; - coarse_hopp_PRECISION_vectorized( out_pt, in_pt, D_vectorized, l ); - } + operator_PRECISION_free( op, _ODDEVEN, l ); + coarse_operator_PRECISION_free_vectorized( op, l ); - START_LOCKED_MASTER(threading) - if ( op->c.comm ) { - for ( mu=0; mu<4; mu++ ) { - // wait for +mu direction - ghost_wait_PRECISION( out, mu, +1, &(op->c), plus_dir_param, l ); - } - } - END_LOCKED_MASTER(threading) +#ifndef OPTIMIZED_COARSE_SELF_COUPLING_PRECISION - END_NO_HYPERTHREADS(threading) + FREE( op->clover_oo_inv, complex_PRECISION, SQUARE(2*nv)*op->num_odd_sites ); +#ifdef HAVE_TM1p1 + FREE( op->clover_doublet_oo_inv, complex_PRECISION, SQUARE(4*nv)*op->num_odd_sites ); #endif -} - - -void coarse_pn_hopping_term_PRECISION_vectorized( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, - const int amount, level_struct *l, int sign, struct Thread *threading ) { - -#ifdef VECTORIZE_COARSE_OPERATOR_PRECISION - START_NO_HYPERTHREADS(threading) - - int mu, i, num_site_var=l->num_lattice_site_var, - start=0, num_lattice_sites=l->num_inner_lattice_sites, - plus_dir_param=_FULL_SYSTEM, minus_dir_param=_FULL_SYSTEM; - vector_PRECISION in_pt, out_pt; - - OPERATOR_TYPE_PRECISION *D_vectorized; - int column_offset = SIMD_LENGTH_PRECISION*((l->num_lattice_site_var+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - int link_offset = 2*l->num_lattice_site_var*column_offset; - int *neighbor_fw = op->neighbor_table; - int *neighbor_bw = op->backward_neighbor_table; - - int core_start; - int core_end; - - void (*coarse_hopp)(vector_PRECISION eta, vector_PRECISION phi, OPERATOR_TYPE_PRECISION *D, level_struct *l); - if(sign == +1) - coarse_hopp = coarse_hopp_PRECISION_vectorized; - else - coarse_hopp = coarse_n_hopp_PRECISION_vectorized; - - - if ( l->num_processes > 1 && op->c.comm ) { - set_boundary_PRECISION( out, 0, l, threading ); - - if ( amount == _EVEN_SITES ) { - minus_dir_param = _ODD_SITES; - plus_dir_param = _EVEN_SITES; - } else if ( amount == _ODD_SITES ) { - minus_dir_param = _EVEN_SITES; - plus_dir_param = _ODD_SITES; - } - - START_MASTER(threading) - for ( mu=0; mu<4; mu++ ) { - // send in -mu direction - ghost_sendrecv_PRECISION( in, mu, -1, &(op->c), minus_dir_param, l ); - } - END_MASTER(threading) - - if ( amount == _EVEN_SITES ) { - start = op->num_even_sites, num_lattice_sites = op->num_odd_sites; - } else if ( amount == _ODD_SITES ) { - start = 0; num_lattice_sites = op->num_even_sites; - } - compute_core_start_end_custom(start, start+num_lattice_sites, &core_start, &core_end, l, threading, 1); - - // prepare for sending to fw: compute hopping terms into forward boundary buffer - for ( i=core_start; inum_inner_lattice_sites) - continue; - out_pt = out + 
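/* Illustration (stand-alone sketch, not a routine from this patch): the "pn" hopping
 * interface used throughout this file folds the sign of the coupling into a single
 * routine; the variant above realizes it by picking the +D or -D per-link kernel once
 * through a function pointer instead of duplicating the whole site loop for the two
 * signs.  The kernels below are trivial placeholders for "out += D*in" / "out -= D*in". */
#include <complex.h>
#include <stddef.h>

typedef void (*hopp_fn)( float _Complex *out, const float _Complex *in, int ns );

static void add_coupling( float _Complex *out, const float _Complex *in, int ns )
{ for ( int r=0; r<ns; r++ ) out[r] += in[r]; }      /* placeholder: out += D*in */

static void sub_coupling( float _Complex *out, const float _Complex *in, int ns )
{ for ( int r=0; r<ns; r++ ) out[r] -= in[r]; }      /* placeholder: out -= D*in */

static void pn_sweep_sketch( float _Complex *out, const float _Complex *in,
                             int num_sites, int ns, int sign ) {
  hopp_fn hopp = ( sign == +1 ) ? add_coupling : sub_coupling;   /* chosen once */
  for ( int i=0; i<num_sites; i++ )
    hopp( out + (size_t)i*ns, in + (size_t)i*ns, ns );
}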
num_site_var*neighbor_fw[5*i+1+mu]; - in_pt = in + num_site_var*neighbor_fw[5*i]; - D_vectorized = op->D_transformed_vectorized + 4*link_offset*neighbor_fw[5*i] + mu*link_offset; - coarse_hopp( out_pt, in_pt, D_vectorized, l ); - } - } - START_LOCKED_MASTER(threading) - for ( mu=0; mu<4; mu++ ) { - // send in +mu direction - ghost_sendrecv_PRECISION( out, mu, +1, &(op->c), plus_dir_param, l ); - } - for ( mu=0; mu<4; mu++ ) { - // wait for -mu direction - ghost_wait_PRECISION( in, mu, -1, &(op->c), minus_dir_param, l ); - } - END_LOCKED_MASTER(threading) - } - else - SYNC_CORES(threading) - - - if ( amount == _EVEN_SITES ) { - start = 0; num_lattice_sites = op->num_even_sites; - } else if ( amount == _ODD_SITES ) { - start = op->num_even_sites, num_lattice_sites = op->num_odd_sites; - } - compute_core_start_end_custom(start, start+num_lattice_sites, &core_start, &core_end, l, threading, 1); - - // assumptions (1) self coupling has already been performed - // OR (2) "out" is initialized with zeros - for ( i=core_start; i= l->num_inner_lattice_sites) - continue; - D_vectorized = op->D_transformed_vectorized + 4*link_offset*neighbor_bw[5*i+1+mu] + mu*link_offset; - in_pt = in + num_site_var*neighbor_bw[5*i+1+mu]; - coarse_hopp( out_pt, in_pt, D_vectorized, l ); - } - - // compute U_mu couplings - for(int mu=0; mu<4; mu++) { - D_vectorized = op->D_vectorized + 4*link_offset*neighbor_fw[5*i] + mu*link_offset; - in_pt = in + num_site_var*neighbor_fw[5*i+1+mu]; - coarse_hopp( out_pt, in_pt, D_vectorized, l ); - } - } - - - // wait for terms from bw and add them - if ( l->num_processes > 1 && op->c.comm ) { - START_LOCKED_MASTER(threading) - for ( mu=0; mu<4; mu++ ) { - // wait for +mu direction - ghost_wait_PRECISION( out, mu, +1, &(op->c), plus_dir_param, l ); - } - END_LOCKED_MASTER(threading) - } - else - SYNC_CORES(threading) - END_NO_HYPERTHREADS(threading) +#else + int column_offset = SIMD_LENGTH_PRECISION*((2*nv+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); + FREE_HUGEPAGES( op->clover_oo_inv_vectorized, PRECISION, 2*2*nv*column_offset*op->num_odd_sites ); +#ifdef HAVE_TM1p1 + int column_doublet_offset = SIMD_LENGTH_PRECISION*((4*nv+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); + FREE_HUGEPAGES( op->clover_doublet_oo_inv_vectorized, PRECISION, 2*4*nv*column_doublet_offset*op->num_odd_sites ); #endif -} - - -void coarse_n_hopping_term_PRECISION_vectorized( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, - const int amount, level_struct *l, struct Thread *threading ) { - -#ifdef VECTORIZE_COARSE_OPERATOR_PRECISION - START_NO_HYPERTHREADS(threading) - - int mu, i, index, num_site_var=l->num_lattice_site_var, - start=0, num_lattice_sites=l->num_inner_lattice_sites, - plus_dir_param=_FULL_SYSTEM, minus_dir_param=_FULL_SYSTEM; - vector_PRECISION in_pt, out_pt; - - OPERATOR_TYPE_PRECISION *D_vectorized; - int column_offset = SIMD_LENGTH_PRECISION*((l->num_lattice_site_var+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - int vectorized_link_offset = 2*l->num_lattice_site_var*column_offset; - - int core_start; - int core_end; - - // assumptions (1) self coupling has already been performed - // OR (2) "out" is initialized with zeros - set_boundary_PRECISION( out, 0, l, threading ); - - if ( amount == _EVEN_SITES ) { - minus_dir_param = _ODD_SITES; - plus_dir_param = _EVEN_SITES; - } else if ( amount == _ODD_SITES ) { - minus_dir_param = _EVEN_SITES; - plus_dir_param = _ODD_SITES; - } - - START_MASTER(threading) - if ( op->c.comm ) { - for ( mu=0; mu<4; mu++ ) { - // 
communicate in -mu direction - ghost_sendrecv_PRECISION( in, mu, -1, &(op->c), minus_dir_param, l ); - } - } - END_MASTER(threading) - SYNC_CORES(threading) - - if ( amount == _EVEN_SITES ) { - start = op->num_even_sites, num_lattice_sites = op->num_odd_sites; - } else if ( amount == _ODD_SITES ) { - start = 0; num_lattice_sites = op->num_even_sites; - } - compute_core_start_end_custom(start, start+num_lattice_sites, &core_start, &core_end, l, threading, 1); - - // D is applied in an input-centric way - // this makes threading a bit ugly, is there a better way? - // compute U_mu^dagger coupling - for ( i=core_start; ineighbor_table[index]; - D_vectorized = op->D_transformed_vectorized + 4*vectorized_link_offset*op->neighbor_table[index] + 0*vectorized_link_offset; - index++; - out_pt = out + num_site_var*op->neighbor_table[index+T]; - coarse_n_hopp_PRECISION_vectorized( out_pt, in_pt, D_vectorized, l ); - } - SYNC_CORES(threading) - for ( i=core_start; ineighbor_table[index]; - D_vectorized = op->D_transformed_vectorized + 4*vectorized_link_offset*op->neighbor_table[index] + 1*vectorized_link_offset; - index++; - out_pt = out + num_site_var*op->neighbor_table[index+Z]; - coarse_n_hopp_PRECISION_vectorized( out_pt, in_pt, D_vectorized, l ); - } - SYNC_CORES(threading) - for ( i=core_start; ineighbor_table[index]; - D_vectorized = op->D_transformed_vectorized + 4*vectorized_link_offset*op->neighbor_table[index] + 2*vectorized_link_offset; - index++; - out_pt = out + num_site_var*op->neighbor_table[index+Y]; - coarse_n_hopp_PRECISION_vectorized( out_pt, in_pt, D_vectorized, l ); - } - SYNC_CORES(threading) - for ( i=core_start; ineighbor_table[index]; - D_vectorized = op->D_transformed_vectorized + 4*vectorized_link_offset*op->neighbor_table[index] + 3*vectorized_link_offset; - index++; - out_pt = out + num_site_var*op->neighbor_table[index+X]; - coarse_n_hopp_PRECISION_vectorized( out_pt, in_pt, D_vectorized, l ); - } - - START_LOCKED_MASTER(threading) - if ( op->c.comm ) { - for ( mu=0; mu<4; mu++ ) { - // communicate in +mu direction - ghost_sendrecv_PRECISION( out, mu, +1, &(op->c), plus_dir_param, l ); - } - for ( mu=0; mu<4; mu++ ) { - // wait for -mu direction - ghost_wait_PRECISION( in, mu, -1, &(op->c), minus_dir_param, l ); - } - } - END_LOCKED_MASTER(threading) - - if ( amount == _EVEN_SITES ) { - start = 0; num_lattice_sites = op->num_even_sites; - } else if ( amount == _ODD_SITES ) { - start = op->num_even_sites, num_lattice_sites = op->num_odd_sites; - } - compute_core_start_end_custom(start, start+num_lattice_sites, &core_start, &core_end, l, threading, 1); - - // compute U_mu couplings - for ( i=core_start; ineighbor_table[index]; - D_vectorized = op->D_vectorized + 4*vectorized_link_offset*op->neighbor_table[index]; - index++; - in_pt = in + num_site_var*op->neighbor_table[index+T]; - coarse_n_hopp_PRECISION_vectorized( out_pt, in_pt, D_vectorized, l ); - - D_vectorized += vectorized_link_offset; - in_pt = in + num_site_var*op->neighbor_table[index+Z]; - coarse_n_hopp_PRECISION_vectorized( out_pt, in_pt, D_vectorized, l ); - - D_vectorized += vectorized_link_offset; - in_pt = in + num_site_var*op->neighbor_table[index+Y]; - coarse_n_hopp_PRECISION_vectorized( out_pt, in_pt, D_vectorized, l ); - - D_vectorized += vectorized_link_offset; - in_pt = in + num_site_var*op->neighbor_table[index+X]; - coarse_n_hopp_PRECISION_vectorized( out_pt, in_pt, D_vectorized, l ); - } - - START_LOCKED_MASTER(threading) - if ( op->c.comm ) { - for ( mu=0; mu<4; mu++ ) { - // wait for +mu 
direction - ghost_wait_PRECISION( out, mu, +1, &(op->c), plus_dir_param, l ); - } - } - END_LOCKED_MASTER(threading) - END_NO_HYPERTHREADS(threading) #endif + +#ifdef HAVE_TM1p1 + FREE( op->buffer[0], complex_PRECISION, 4*vs ); +#else + FREE( op->buffer[0], complex_PRECISION, 2*vs ); +#endif + FREE( op->buffer, complex_PRECISION*, 2 ); } - void coarse_solve_odd_even_PRECISION( gmres_PRECISION_struct *p, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { SYNC_CORES(threading) @@ -1236,14 +637,14 @@ void coarse_solve_odd_even_PRECISION( gmres_PRECISION_struct *p, operator_PRECIS coarse_diag_oo_inv_PRECISION( p->x, p->b, op, l, threading ); PROF_PRECISION_STOP( _SC, 0, threading ); PROF_PRECISION_START( _NC, threading ); - coarse_n_hopping_term_PRECISION( p->b, p->x, op, _EVEN_SITES, l, threading ); + coarse_pn_hopping_term_PRECISION( p->b, p->x, op, _EVEN_SITES, -1, l, threading ); PROF_PRECISION_STOP( _NC, 0, threading ); fgmres_PRECISION( p, l, threading ); // even to odd PROF_PRECISION_START( _NC, threading ); - coarse_n_hopping_term_PRECISION( p->b, p->x, op, _ODD_SITES, l, threading ); + coarse_pn_hopping_term_PRECISION( p->b, p->x, op, _ODD_SITES, -1, l, threading ); PROF_PRECISION_STOP( _NC, 1, threading ); PROF_PRECISION_START( _SC, threading ); coarse_diag_oo_inv_PRECISION( p->x, p->b, op, l, threading ); @@ -1251,80 +652,76 @@ void coarse_solve_odd_even_PRECISION( gmres_PRECISION_struct *p, operator_PRECIS SYNC_CORES(threading) } - void coarse_apply_schur_complement_PRECISION( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { - // start and end indices for vector functions depending on thread - int start; - int end; - // compute start and end indices for core - // this puts zero for all other hyperthreads, so we can call functions below with all hyperthreads - compute_core_start_end(op->num_even_sites*l->num_lattice_site_var, l->inner_vector_size, &start, &end, l, threading); - + int start = op->num_even_sites*l->num_lattice_site_var; + int end = l->inner_vector_size; vector_PRECISION *tmp = op->buffer; SYNC_CORES(threading) PROF_PRECISION_START( _SC, threading ); coarse_diag_ee_PRECISION( out, in, op, l, threading ); PROF_PRECISION_STOP( _SC, 0, threading ); - SYNC_CORES(threading) - vector_PRECISION_define( tmp[0], 0, start, end, l ); + SYNC_CORES(threading); + vector_PRECISION_define_zero( tmp[0], start, end, l, threading ); SYNC_CORES(threading) PROF_PRECISION_START( _NC, threading ); - coarse_hopping_term_PRECISION( tmp[0], in, op, _ODD_SITES, l, threading ); + coarse_pn_hopping_term_PRECISION( tmp[0], in, op, _ODD_SITES, +1, l, threading ); PROF_PRECISION_STOP( _NC, 0, threading ); PROF_PRECISION_START( _SC, threading ); coarse_diag_oo_inv_PRECISION( tmp[1], tmp[0], op, l, threading ); PROF_PRECISION_STOP( _SC, 1, threading ); PROF_PRECISION_START( _NC, threading ); - coarse_n_hopping_term_PRECISION( out, tmp[1], op, _EVEN_SITES, l, threading ); + coarse_pn_hopping_term_PRECISION( out, tmp[1], op, _EVEN_SITES, -1, l, threading ); PROF_PRECISION_STOP( _NC, 1, threading ); } void g5D_coarse_solve_odd_even_PRECISION( gmres_PRECISION_struct *p, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { - int start_even, end_even, start_odd, end_odd; - compute_core_start_end_custom(0, op->num_even_sites*l->num_lattice_site_var, &start_even, &end_even, l, threading, l->num_lattice_site_var ); - compute_core_start_end_custom(op->num_even_sites*l->num_lattice_site_var, 
l->inner_vector_size, &start_odd, &end_odd, l, threading, l->num_lattice_site_var ); + int start_even = 0, end_even = op->num_even_sites*l->num_lattice_site_var, + start_odd = end_even, end_odd = l->inner_vector_size; + int thread_start_even, thread_end_even, thread_start_odd, thread_end_odd; + compute_core_start_end_custom( start_even, end_even, &thread_start_even, &thread_end_even, l, threading, l->num_lattice_site_var ); + compute_core_start_end_custom( start_odd, end_odd, &thread_start_odd, &thread_end_odd, l, threading, l->num_lattice_site_var ); vector_PRECISION tmp = op->buffer[0]; - SYNC_CORES(threading) - vector_PRECISION_define( tmp, 0, start_even, end_even, l ); + SYNC_CORES(threading); + vector_PRECISION_define_zero( tmp, start_even, end_even, l, threading ); SYNC_CORES(threading) PROF_PRECISION_START( _SC, threading ); - coarse_gamma5_PRECISION( p->b, p->b, start_odd, end_odd, l ); + coarse_gamma5_PRECISION( p->b, p->b, thread_start_odd, thread_end_odd, l ); SYNC_CORES(threading) coarse_diag_oo_inv_PRECISION( p->x, p->b, op, l, threading ); SYNC_CORES(threading) - coarse_gamma5_PRECISION( p->b, p->b, start_odd, end_odd, l ); + coarse_gamma5_PRECISION( p->b, p->b, thread_start_odd, thread_end_odd, l ); PROF_PRECISION_STOP( _SC, 0, threading ); PROF_PRECISION_START( _NC, threading ); - coarse_n_hopping_term_PRECISION( tmp, p->x, op, _EVEN_SITES, l, threading ); + coarse_pn_hopping_term_PRECISION( tmp, p->x, op, _EVEN_SITES, -1, l, threading ); PROF_PRECISION_STOP( _NC, 0, threading ); - coarse_gamma5_PRECISION( tmp, tmp, start_even, end_even, l ); + coarse_gamma5_PRECISION( tmp, tmp, thread_start_even, thread_end_even, l ); SYNC_CORES(threading) - vector_PRECISION_plus( p->b, p->b, tmp, start_even, end_even, l ); + vector_PRECISION_plus( p->b, p->b, tmp, thread_start_even, thread_end_even, l ); fgmres_PRECISION( p, l, threading ); SYNC_CORES(threading) - coarse_gamma5_PRECISION( p->b, p->b, start_odd, end_odd, l ); + coarse_gamma5_PRECISION( p->b, p->b, thread_start_odd, thread_end_odd, l ); SYNC_CORES(threading) coarse_diag_oo_inv_PRECISION( p->x, p->b, op, l, threading ); SYNC_CORES(threading) // even to odd PROF_PRECISION_START( _NC, threading ); - vector_PRECISION_define( tmp, 0, start_odd, end_odd, l ); - SYNC_CORES(threading) - coarse_n_hopping_term_PRECISION( tmp, p->x, op, _ODD_SITES, l, threading ); + vector_PRECISION_define_zero( tmp, start_odd, end_odd, l, threading ); + SYNC_CORES(threading); + coarse_pn_hopping_term_PRECISION( tmp, p->x, op, _ODD_SITES, -1, l, threading ); PROF_PRECISION_STOP( _NC, 1, threading ); SYNC_CORES(threading) PROF_PRECISION_START( _SC, threading ); coarse_diag_oo_inv_PRECISION( p->b, tmp, op, l, threading ); - vector_PRECISION_plus( p->x, p->x, p->b, start_odd, end_odd, l ); + vector_PRECISION_plus( p->x, p->x, p->b, thread_start_odd, thread_end_odd, l ); PROF_PRECISION_STOP( _SC, 1, threading ); SYNC_CORES(threading) @@ -1333,9 +730,10 @@ void g5D_coarse_solve_odd_even_PRECISION( gmres_PRECISION_struct *p, operator_PR void g5D_coarse_apply_schur_complement_PRECISION( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { - int start_even, end_even, start_odd, end_odd; - compute_core_start_end_custom(0, op->num_even_sites*l->num_lattice_site_var, &start_even, &end_even, l, threading, l->num_lattice_site_var ); - compute_core_start_end_custom(op->num_even_sites*l->num_lattice_site_var, l->inner_vector_size, &start_odd, &end_odd, l, threading, l->num_lattice_site_var ); + int 
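/* Illustration (stand-alone numeric check, not from this patch): with sites ordered
 * even-first, coarse_solve_odd_even_PRECISION and coarse_apply_schur_complement_PRECISION
 * above implement the usual odd-even reduction of
 *     [ A_ee  D_eo ] [x_e]   [b_e]
 *     [ D_oe  A_oo ] [x_o] = [b_o] ,
 * handing only  S = A_ee - D_eo A_oo^{-1} D_oe  to FGMRES and reconstructing x_o
 * afterwards.  A scalar (1x1 block) stand-in that checks the algebra: */
#include <stdio.h>

int main( void ) {
  double aee = 4.0, aoo = 3.0, deo = 1.0, doe = 2.0, be = 5.0, bo = 7.0;

  double xo  = bo/aoo;                    /* x_o  = A_oo^{-1} b_o      (diag_oo_inv) */
  double rhs = be - deo*xo;               /* b_e := b_e - D_eo x_o     (hopping, -1) */
  double S   = aee - deo/aoo*doe;         /* Schur complement                        */
  double xe  = rhs/S;                     /* "FGMRES" step, here a plain division    */
  xo = (bo - doe*xe)/aoo;                 /* odd back-substitution                   */

  double det = aee*aoo - deo*doe;         /* compare against the direct 2x2 solve    */
  double xe_direct = (  aoo*be - deo*bo )/det;
  double xo_direct = ( -doe*be + aee*bo )/det;
  printf( "schur: (%g, %g)  direct: (%g, %g)\n", xe, xo, xe_direct, xo_direct );
  return 0;                               /* both print (0.8, 1.8)                   */
}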
start_even = 0, end_even = op->num_even_sites*l->num_lattice_site_var, + start_odd = end_even, end_odd = l->inner_vector_size; + int thread_start_even, thread_end_even; + compute_core_start_end_custom( start_even, end_even, &thread_start_even, &thread_end_even, l, threading, l->num_lattice_site_var ); vector_PRECISION *tmp = op->buffer; @@ -1343,20 +741,20 @@ void g5D_coarse_apply_schur_complement_PRECISION( vector_PRECISION out, vector_P PROF_PRECISION_START( _SC, threading ); coarse_diag_ee_PRECISION( out, in, op, l, threading ); PROF_PRECISION_STOP( _SC, 0, threading ); - SYNC_CORES(threading) - vector_PRECISION_define( tmp[0], 0, start_odd, end_odd, l ); + SYNC_CORES(threading); + vector_PRECISION_define_zero( tmp[0], start_odd, end_odd, l, threading ); SYNC_CORES(threading) PROF_PRECISION_START( _NC, threading ); - coarse_hopping_term_PRECISION( tmp[0], in, op, _ODD_SITES, l, threading ); + coarse_pn_hopping_term_PRECISION( tmp[0], in, op, _ODD_SITES, +1, l, threading ); PROF_PRECISION_STOP( _NC, 0, threading ); PROF_PRECISION_START( _SC, threading ); coarse_diag_oo_inv_PRECISION( tmp[1], tmp[0], op, l, threading ); PROF_PRECISION_STOP( _SC, 1, threading ); PROF_PRECISION_START( _NC, threading ); - coarse_n_hopping_term_PRECISION( out, tmp[1], op, _EVEN_SITES, l, threading ); + coarse_pn_hopping_term_PRECISION( out, tmp[1], op, _EVEN_SITES, -1, l, threading ); PROF_PRECISION_STOP( _NC, 1, threading ); SYNC_CORES(threading) - coarse_gamma5_PRECISION( out, out, start_even, end_even, l ); + coarse_gamma5_PRECISION( out, out, thread_start_even, thread_end_even, l ); SYNC_CORES(threading) } @@ -1373,10 +771,10 @@ void coarse_odd_even_PRECISION_test( vector_PRECISION out, vector_PRECISION in, // transformation part vector_PRECISION_copy( buf1, in, 0, l->inner_vector_size, l ); // even to odd - vector_PRECISION_define( out, 0, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l->inner_vector_size, l ); - END_LOCKED_MASTER(threading) + vector_PRECISION_define_zero( out, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l->inner_vector_size, l, no_threading ); + END_LOCKED_MASTER(threading); - coarse_hopping_term_PRECISION( out, buf1, &(l->oe_op_PRECISION), _ODD_SITES, l, threading ); + coarse_pn_hopping_term_PRECISION( out, buf1, &(l->oe_op_PRECISION), _ODD_SITES, +1, l, threading ); coarse_diag_oo_inv_PRECISION( buf2, out, &(l->oe_op_PRECISION), l, threading ); START_LOCKED_MASTER(threading) @@ -1398,13 +796,13 @@ void coarse_odd_even_PRECISION_test( vector_PRECISION out, vector_PRECISION in, if ( g.method == 6 ) { START_LOCKED_MASTER(threading) coarse_gamma5_PRECISION( out, out, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l->inner_vector_size, l ); - vector_PRECISION_define( buf1, 0, 0, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l ); - coarse_hopping_term_PRECISION( buf1, buf2, &(l->oe_op_PRECISION), _EVEN_SITES, l, no_threading ); + vector_PRECISION_define_zero( buf1, 0, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l, no_threading ); + coarse_pn_hopping_term_PRECISION( buf1, buf2, &(l->oe_op_PRECISION), _EVEN_SITES, +1, l, no_threading ); coarse_gamma5_PRECISION( buf1, buf1, 0, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l ); vector_PRECISION_plus( out, out, buf1, 0, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l ); END_LOCKED_MASTER(threading) } else { - coarse_hopping_term_PRECISION( out, buf2, &(l->oe_op_PRECISION), _EVEN_SITES, l, threading ); + coarse_pn_hopping_term_PRECISION( out, buf2, 
&(l->oe_op_PRECISION), _EVEN_SITES, +1, l, threading ); } PUBLIC_FREE( buf1, complex_PRECISION, 2*l->vector_size ); diff --git a/src/coarse_oddeven_generic.h b/src/coarse_oddeven_generic.h index 807632a..2d9e687 100644 --- a/src/coarse_oddeven_generic.h +++ b/src/coarse_oddeven_generic.h @@ -23,37 +23,24 @@ #define COARSE_ODDEVEN_PRECISION_HEADER struct Thread; + + void coarse_oddeven_alloc_PRECISION( level_struct *l ); -#ifndef HAVE_TM - void coarse_selfcoupling_LU_decomposition_PRECISION( config_PRECISION output, config_PRECISION input, level_struct *l ); -#else - void coarse_selfcoupling_LU_decomposition_PRECISION( const config_PRECISION output, config_PRECISION input, config_PRECISION input_anti, level_struct *l ); -#endif - void coarse_perform_fwd_bwd_subs_PRECISION( vector_PRECISION x, vector_PRECISION b, config_PRECISION A, level_struct *l ); - - void coarse_oddeven_setup_PRECISION( operator_PRECISION_struct *in, int reorder, level_struct *l ); - void coarse_oddeven_re_setup_PRECISION( operator_PRECISION_struct *in, int reorder, level_struct *l, struct Thread *threading ); + void coarse_oddeven_setup_PRECISION( operator_PRECISION_struct *in, int reorder, level_struct *l, + struct Thread *threading ); + void coarse_oddeven_PRECISION_set_self_couplings( level_struct *l, struct Thread *threading ); + void coarse_oddeven_free_PRECISION( level_struct *l ); - void coarse_solve_odd_even_PRECISION( gmres_PRECISION_struct *p, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); - void coarse_apply_schur_complement_PRECISION( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); - void g5D_coarse_solve_odd_even_PRECISION( gmres_PRECISION_struct *p, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); - void g5D_coarse_apply_schur_complement_PRECISION( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); - - void coarse_diag_ee_PRECISION( vector_PRECISION y, vector_PRECISION x, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); - void coarse_diag_oo_PRECISION( vector_PRECISION y, vector_PRECISION x, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); - void coarse_diag_oo_inv_PRECISION( vector_PRECISION y, vector_PRECISION x, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); - void coarse_hopping_term_PRECISION( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, - const int amount, level_struct *l, struct Thread *threading ); - void coarse_n_hopping_term_PRECISION( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, - const int amount, level_struct *l, struct Thread *threading ); - void coarse_hopping_term_PRECISION_vectorized( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, - const int amount, level_struct *l, struct Thread *threading ); - void coarse_pn_hopping_term_PRECISION_vectorized( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, - const int amount, level_struct *l, int sign, struct Thread *threading ); - void coarse_n_hopping_term_PRECISION_vectorized( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, - const int amount, level_struct *l, struct Thread *threading ); - - void coarse_odd_even_PRECISION_test( vector_PRECISION c4, vector_PRECISION c1, level_struct *l, struct Thread *threading ); + void 
coarse_solve_odd_even_PRECISION( gmres_PRECISION_struct *p, operator_PRECISION_struct *op, level_struct *l, + struct Thread *threading ); + void coarse_apply_schur_complement_PRECISION( vector_PRECISION out, vector_PRECISION in, + operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); + void g5D_coarse_solve_odd_even_PRECISION( gmres_PRECISION_struct *p, operator_PRECISION_struct *op, + level_struct *l, struct Thread *threading ); + void g5D_coarse_apply_schur_complement_PRECISION( vector_PRECISION out, vector_PRECISION in, + operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); + void coarse_odd_even_PRECISION_test( vector_PRECISION c4, vector_PRECISION c1, + level_struct *l, struct Thread *threading ); #endif diff --git a/src/coarse_operator_generic.c b/src/coarse_operator_generic.c index 98a56fc..9786b5e 100644 --- a/src/coarse_operator_generic.c +++ b/src/coarse_operator_generic.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder. + * copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder. * * This file is part of the DDalphaAMG solver library. * @@ -24,34 +24,46 @@ void coarse_operator_PRECISION_alloc( level_struct *l ) { int nd = l->next_level->num_inner_lattice_sites, - k = l->next_level->num_lattice_site_var; + k = l->next_level->num_parent_eig_vect*2; l->next_level->D_size = k*k*4*nd; l->next_level->clover_size = ((k*(k+1))/2)*nd; -#ifdef HAVE_TM l->next_level->block_size = ((k/2*(k/2+1)))*nd; -#endif operator_PRECISION_alloc( &(l->next_level->op_PRECISION), _ORDINARY, l->next_level ); -} +} void coarse_operator_PRECISION_free( level_struct *l ) { operator_PRECISION_free( &(l->next_level->op_PRECISION), _ORDINARY, l->next_level ); -#ifdef VECTORIZE_COARSE_OPERATOR_PRECISION - operator_PRECISION_struct *op = &(l->next_level->s_PRECISION.op); + coarse_operator_PRECISION_free_vectorized( &(l->next_level->s_PRECISION.op), l->next_level ); +} + +void coarse_operator_PRECISION_free_vectorized( operator_PRECISION_struct *op, level_struct *l ) { + +#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION if( op->D_vectorized != NULL ) { - int n2 = 2*l->next_level->num_lattice_sites-l->next_level->num_inner_lattice_sites, n = l->next_level->num_inner_lattice_sites; - int column_offset = SIMD_LENGTH_PRECISION*((l->next_level->num_lattice_site_var+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); + int n2 = (l->depth>0 && l->level>0) ? 
(2*l->num_lattice_sites-l->num_inner_lattice_sites):l->num_inner_lattice_sites; + int column_offset = 2*SIMD_LENGTH_PRECISION*((l->num_parent_eig_vect+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); // 2 is for complex, 4 is for 4 directions - FREE_HUGEPAGES( op->D_vectorized, OPERATOR_TYPE_PRECISION, 2*4*l->next_level->num_lattice_site_var*column_offset*n2 ); - FREE_HUGEPAGES( op->D_transformed_vectorized, OPERATOR_TYPE_PRECISION, 2*4*l->next_level->num_lattice_site_var*column_offset*n2 ); - FREE_HUGEPAGES( op->clover_vectorized, OPERATOR_TYPE_PRECISION, 2*l->next_level->num_lattice_site_var*column_offset*n ); + FREE_HUGEPAGES( op->D_vectorized, OPERATOR_TYPE_PRECISION, 2*4*2*l->num_parent_eig_vect*column_offset*n2 ); + FREE_HUGEPAGES( op->D_transformed_vectorized, OPERATOR_TYPE_PRECISION, 2*4*2*l->num_parent_eig_vect*column_offset*n2 ); } #endif -} +#ifdef OPTIMIZED_SELF_COUPLING_PRECISION + if( op->clover_vectorized != NULL ) { + int n = l->num_inner_lattice_sites; + int column_offset = SIMD_LENGTH_PRECISION*((2*l->num_parent_eig_vect+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); + FREE_HUGEPAGES( op->clover_vectorized, OPERATOR_TYPE_PRECISION, 2*2*l->num_parent_eig_vect*column_offset*n ); +#ifdef HAVE_TM1p1 + int column_doublet_offset = SIMD_LENGTH_PRECISION*((4*l->num_parent_eig_vect+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); + FREE_HUGEPAGES( op->clover_doublet_vectorized, OPERATOR_TYPE_PRECISION, 2*4*l->num_parent_eig_vect*column_doublet_offset*n ); +#endif + } +#endif +} void coarse_operator_PRECISION_setup( vector_PRECISION *V, level_struct *l ) { @@ -61,10 +73,12 @@ void coarse_operator_PRECISION_setup( vector_PRECISION *V, level_struct *l ) { vector_PRECISION buffer1 = l->vbuf_PRECISION[4], buffer2 = l->vbuf_PRECISION[5]; int mu, n = l->num_eig_vect, i, j, - D_size = l->next_level->D_size, - clover_size = l->next_level->clover_size; + D_size = l->next_level->D_size, + clover_size = l->next_level->clover_size, + block_size = l->next_level->block_size; void (*aggregate_self_coupling)() = (l->depth==0)?d_plus_clover_aggregate_PRECISION:coarse_aggregate_self_couplings_PRECISION, (*aggregate_neighbor_coupling)() = (l->depth==0)?d_neighbor_aggregate_PRECISION:coarse_aggregate_neighbor_couplings_PRECISION; + void (*aggregate_block)() = (l->depth==0)?diagonal_aggregate_PRECISION:coarse_aggregate_block_diagonal_PRECISION; operator_PRECISION_define( &(l->next_level->op_PRECISION), l->next_level ); @@ -72,17 +86,8 @@ void coarse_operator_PRECISION_setup( vector_PRECISION *V, level_struct *l ) { l->next_level->op_PRECISION.D[j] = _COMPLEX_PRECISION_ZERO; for ( j=0; jnext_level->op_PRECISION.clover[j] = _COMPLEX_PRECISION_ZERO; -#ifdef HAVE_TM - int block_size = l->next_level->block_size; - - void (*aggregate_tm_term)() = (l->depth==0)?diagonal_aggregate_PRECISION:coarse_aggregate_anti_block_diagonal_PRECISION, - (*aggregate_odd_proj)() = (l->depth==0)?diagonal_aggregate_PRECISION:coarse_aggregate_block_diagonal_PRECISION; - for ( j=0; jnext_level->op_PRECISION.tm_term[j] = _COMPLEX_PRECISION_ZERO; - for ( j=0; jnext_level->op_PRECISION.odd_proj[j] = _COMPLEX_PRECISION_ZERO; -#endif + l->next_level->op_PRECISION.odd_proj[j] = _COMPLEX_PRECISION_ZERO; // for all test vectors V[i]: for ( i=0; is_PRECISION), l ); // calculate selfcoupling entries of the coarse grid operator set_coarse_self_coupling_PRECISION( buffer1, buffer2, V, i, l ); -#ifdef HAVE_TM - //tm_term - aggregate_tm_term( buffer1, buffer2, V[i], l->s_PRECISION.op.tm_term, l ); - set_block_diagonal_PRECISION( buffer1, 
buffer2, V, i, l->next_level->op_PRECISION.tm_term, l ); //odd_proj - aggregate_odd_proj( buffer1, buffer2, V[i], l->s_PRECISION.op.odd_proj, l ); + aggregate_block( buffer1, buffer2, V[i], l->s_PRECISION.op.odd_proj, l ); set_block_diagonal_PRECISION( buffer1, buffer2, V, i, l->next_level->op_PRECISION.odd_proj, l ); -#endif - + for ( mu=0; mu<4; mu++ ) { // finish updating ghostcells of V[i] negative_wait_PRECISION( mu, &(l->s_PRECISION.op.c), l ); @@ -112,13 +112,45 @@ void coarse_operator_PRECISION_setup( vector_PRECISION *V, level_struct *l ) { } } + coarse_operator_PRECISION_setup_finalize( l, no_threading ); + t1 = MPI_Wtime(); if ( g.print > 0 ) printf0("depth: %d, time spent for setting up next coarser operator: %lf seconds\n", l->depth, t1-t0 ); } +void coarse_operator_PRECISION_setup_finalize( level_struct *l, struct Thread *threading ) { + + int block_size = l->next_level->block_size; + + l->next_level->op_PRECISION.m0 = l->s_PRECISION.op.m0; +#ifdef HAVE_TM + //tm_term + PRECISION mf = (g.mu_factor[l->depth]) ? g.mu_factor[l->next_level->depth]/g.mu_factor[l->depth]:0; + if ( mf*l->s_PRECISION.op.mu + mf*l->s_PRECISION.op.mu_even_shift == 0 && + mf*l->s_PRECISION.op.mu + mf*l->s_PRECISION.op.mu_odd_shift == 0 ) + vector_PRECISION_define_zero( l->next_level->op_PRECISION.tm_term, 0, block_size, l->next_level, threading ); + else + tm_term_PRECISION_setup( mf*l->s_PRECISION.op.mu, mf*l->s_PRECISION.op.mu_even_shift, + mf*l->s_PRECISION.op.mu_odd_shift, &(l->next_level->op_PRECISION), + l->next_level, threading ); +#endif +#ifdef HAVE_TM1p1 + //eps_term + PRECISION ef = (g.epsbar_factor[l->depth]) ? g.epsbar_factor[l->next_level->depth]/g.epsbar_factor[l->depth]:0; + if ( ef*l->s_PRECISION.op.epsbar == 0 && ef*l->s_PRECISION.op.epsbar_ig5_even_shift == 0 && + ef*l->s_PRECISION.op.epsbar_ig5_odd_shift == 0 ) + vector_PRECISION_define_zero( l->next_level->op_PRECISION.epsbar_term, 0, block_size, l->next_level, threading ); + else + epsbar_term_PRECISION_setup( ef*l->s_PRECISION.op.epsbar, ef*l->s_PRECISION.op.epsbar_ig5_even_shift, + ef*l->s_PRECISION.op.epsbar_ig5_odd_shift, &(l->next_level->op_PRECISION), + l->next_level, threading ); +#endif + +} + void set_block_diagonal_PRECISION( vector_PRECISION spin_0_1, vector_PRECISION spin_2_3, - vector_PRECISION *V, const int n, config_PRECISION block, level_struct *l ) { + vector_PRECISION *V, const int n, config_PRECISION block, level_struct *l ) { // U(x) = [ A 0 , A=A*, D=D* // 0 D ] @@ -126,9 +158,10 @@ void set_block_diagonal_PRECISION( vector_PRECISION spin_0_1, vector_PRECISION s // suitable for tm_term and odd_proj int i, j, k, m, k1, k2, num_aggregates = l->is_PRECISION.num_agg, - num_eig_vect = l->next_level->num_lattice_site_var/2, - aggregate_size = l->inner_vector_size / num_aggregates, offset = l->num_lattice_site_var/2, - block_site_size = (num_eig_vect*(num_eig_vect+1)); + num_eig_vect = l->next_level->num_parent_eig_vect, + aggregate_size = l->num_inner_lattice_sites*l->num_parent_eig_vect*2/num_aggregates, + offset = l->num_parent_eig_vect, + block_site_size = (num_eig_vect*(num_eig_vect+1)); vector_PRECISION spin_0_1_pt, spin_2_3_pt, interpolation_data; config_PRECISION block_pt; @@ -157,9 +190,10 @@ void set_coarse_self_coupling_PRECISION( vector_PRECISION spin_0_1, vector_PRECI vector_PRECISION *V, const int n, level_struct *l ) { int i, j, k, m, k1, k2, num_aggregates = l->is_PRECISION.num_agg, - num_eig_vect = l->next_level->num_lattice_site_var/2, - aggregate_size = l->inner_vector_size / num_aggregates, offset 
= l->num_lattice_site_var/2, - clover_site_size = (l->next_level->num_lattice_site_var*(l->next_level->num_lattice_site_var+1))/2; + num_eig_vect = l->next_level->num_parent_eig_vect, + aggregate_size = l->num_inner_lattice_sites*l->num_parent_eig_vect*2/num_aggregates, + offset = l->num_parent_eig_vect, + clover_site_size = (num_eig_vect*(2*num_eig_vect+1)); vector_PRECISION spin_0_1_pt, spin_2_3_pt, interpolation_data; config_PRECISION clover_pt, clover = l->next_level->op_PRECISION.clover; @@ -211,8 +245,8 @@ void set_coarse_neighbor_coupling_PRECISION( vector_PRECISION spin_0_1, vector_P vector_PRECISION *V, const int mu, const int n, level_struct *l ) { int i, i1, j, k, k1, k2, m, num_aggregates = l->is_PRECISION.num_agg, - num_eig_vect = l->next_level->num_lattice_site_var/2, - offset = l->num_lattice_site_var/2, nlsv = l->num_lattice_site_var, + num_eig_vect = l->next_level->num_parent_eig_vect, + offset = l->num_parent_eig_vect, nlsv = l->num_parent_eig_vect*2, D_link_size = num_eig_vect*num_eig_vect*4, *index_dir = l->is_PRECISION.agg_boundary_index[mu], aggregate_boundary_sites = l->is_PRECISION.agg_boundary_length[mu]/num_aggregates; @@ -261,42 +295,52 @@ void set_coarse_neighbor_coupling_PRECISION( vector_PRECISION spin_0_1, vector_P } } -#ifndef VECTORIZE_COARSE_OPERATOR_PRECISION void coarse_block_operator_PRECISION( vector_PRECISION eta, vector_PRECISION phi, int start, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ) { START_UNTHREADED_FUNCTION(threading) int n = s->num_block_sites, *length = s->dir_length, **index = s->index, - *ind, *neighbor = s->op.neighbor_table, m = l->num_lattice_site_var; + *ind, *neighbor = s->op.neighbor_table, m = l->num_lattice_site_var, num_eig_vect = l->num_parent_eig_vect; vector_PRECISION lphi = phi+start, leta = eta+start; - int hopp_size = 4 * SQUARE( l->num_lattice_site_var ); - config_PRECISION D_pt, D = s->op.D + (start/m)*hopp_size; // site-wise self coupling - int clov_size = ( (l->num_lattice_site_var*(l->num_lattice_site_var+1))/2 ); - config_PRECISION clover = s->op.clover + (start/m)*clov_size; - coarse_self_couplings_PRECISION( leta, lphi, clover, n*m, l ); + coarse_self_couplings_PRECISION( eta, phi, &(s->op), (start/m), (start/m)+n, l); -#ifdef HAVE_TM - int tm_term_size = ( (l->num_lattice_site_var/2*(l->num_lattice_site_var/2+1)) ); - config_PRECISION tm_term = s->op.tm_term + (start/m)*tm_term_size; - if (g.tm_mu + g.tm_mu_odd_shift != 0.0 || g.tm_mu + g.tm_mu_even_shift != 0.0 ) - coarse_add_anti_block_diagonal_PRECISION( leta, lphi, tm_term, n*m, l ); -#endif // inner block couplings +#ifndef OPTIMIZED_COARSE_NEIGHBOR_COUPLING_PRECISION + int hopp_size = 4 * SQUARE( num_eig_vect*2 ); + config_PRECISION D_pt, D = s->op.D + (start/m)*hopp_size; + for ( int mu=0; mu<4; mu++ ) { ind = index[mu]; // mu direction for ( int i=0; iop.D_vectorized + + (start/m)*4*vectorized_link_offset + mu*vectorized_link_offset; + OPERATOR_TYPE_PRECISION *Dminus = s->op.D_transformed_vectorized + + (start/m)*4*vectorized_link_offset + mu*vectorized_link_offset; + ind = index[mu]; // mu direction + for ( int i=0; iop.D; - vector_PRECISION_define( eta1, 0, 0, l->vector_size, l ); - vector_PRECISION_define( eta2, 0, 0, l->vector_size, l ); + vector_PRECISION_define_zero( eta1, 0, l->vector_size, l, no_threading ); + vector_PRECISION_define_zero( eta2, 0, l->vector_size, l, no_threading ); coarse_spinwise_self_couplings_PRECISION( eta1, eta2, phi, s->op.clover, l->inner_vector_size, l ); for ( mu=0; mu<4; mu++ ) { // direction 
mu @@ -334,8 +378,8 @@ void coarse_aggregate_neighbor_couplings_PRECISION( vector_PRECISION eta1, vecto vector_PRECISION eta1_pt, eta2_pt, phi_pt; config_PRECISION D_pt, D = s->op.D; - vector_PRECISION_define( eta1, 0, 0, l->vector_size, l ); - vector_PRECISION_define( eta2, 0, 0, l->vector_size, l ); + vector_PRECISION_define_zero( eta1, 0, l->vector_size, l, no_threading ); + vector_PRECISION_define_zero( eta2, 0, l->vector_size, l, no_threading ); // requires the positive boundaries of phi to be communicated befor for ( i=0; inum_lattice_site_var, - num_eig_vect = l->num_lattice_site_var/2, - clover_step_size1 = (num_eig_vect * (num_eig_vect+1))/2, - clover_step_size2 = SQUARE(l->num_lattice_site_var/2); - config_PRECISION clover_pt = clover; - vector_PRECISION phi_pt=phi, eta_pt=eta, phi_end_pt=phi+length; - // U(x) = [ A B , A=A*, D=D*, C = -B* - // C D ] - // storage order: upper triangle of A, upper triangle of D, B, columnwise - // diagonal coupling - while ( phi_pt < phi_end_pt ) { - // A - mvp_PRECISION( eta_pt, clover_pt, phi_pt, num_eig_vect ); - clover_pt += clover_step_size1; eta_pt += num_eig_vect; phi_pt += num_eig_vect; - // D - mvp_PRECISION( eta_pt, clover_pt, phi_pt, num_eig_vect ); - clover_pt += clover_step_size1; phi_pt -= num_eig_vect; - // C = -B* - nmvh_PRECISION( eta_pt, clover_pt, phi_pt, num_eig_vect ); - phi_pt += num_eig_vect; eta_pt -= num_eig_vect; - // B - mv_PRECISION( eta_pt, clover_pt, phi_pt, num_eig_vect ); - clover_pt += clover_step_size2; phi_pt += num_eig_vect; eta_pt += site_var; - } -} - - void coarse_aggregate_block_diagonal_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, vector_PRECISION phi, config_PRECISION block, level_struct *l ) { int length = l->inner_vector_size, - num_eig_vect = l->num_lattice_site_var/2, + num_eig_vect = l->num_parent_eig_vect, block_step_size = (num_eig_vect * (num_eig_vect+1))/2; config_PRECISION block_pt = block; vector_PRECISION phi_pt=phi, eta1_pt=eta1, eta2_pt=eta2, phi_end_pt=phi+length; @@ -389,71 +403,26 @@ void coarse_aggregate_block_diagonal_PRECISION( vector_PRECISION eta1, vector_PR // 0 D ] // storage order: upper triangle of A, upper triangle of D, columnwise // diagonal coupling - vector_PRECISION_define( eta1_pt, 0, 0, l->vector_size, l ); - vector_PRECISION_define( eta2_pt, 0, 0, l->vector_size, l ); while ( phi_pt < phi_end_pt ) { // A mvp_PRECISION( eta1_pt, block_pt, phi_pt, num_eig_vect ); + vector_PRECISION_define_zero( eta2_pt, 0, num_eig_vect, l, no_threading ); block_pt += block_step_size; eta1_pt += num_eig_vect; eta2_pt += num_eig_vect; phi_pt += num_eig_vect; // D + vector_PRECISION_define_zero( eta1_pt, 0, num_eig_vect, l, no_threading ); mvp_PRECISION( eta2_pt, block_pt, phi_pt, num_eig_vect ); block_pt += block_step_size; eta1_pt += num_eig_vect; eta2_pt += num_eig_vect; phi_pt += num_eig_vect; } } -void coarse_aggregate_anti_block_diagonal_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, vector_PRECISION phi, - config_PRECISION block, level_struct *l ) { - int length = l->inner_vector_size, - num_eig_vect = l->num_lattice_site_var/2, - block_step_size = (num_eig_vect * (num_eig_vect+1))/2; - config_PRECISION block_pt = block; - vector_PRECISION phi_pt=phi, eta1_pt=eta1, eta2_pt=eta2, phi_end_pt=phi+length; - // U(x) = [ A 0 , A=-A*, D=-D* diag. 
excluded - // 0 D ] - // storage order: upper triangle of A, upper triangle of D, columnwise - // diagonal coupling - vector_PRECISION_define( eta1_pt, 0, 0, l->vector_size, l ); - vector_PRECISION_define( eta2_pt, 0, 0, l->vector_size, l ); - while ( phi_pt < phi_end_pt ) { - // A - amvp_PRECISION( eta1_pt, block_pt, phi_pt, num_eig_vect ); - block_pt += block_step_size; eta1_pt += num_eig_vect; eta2_pt += num_eig_vect; phi_pt += num_eig_vect; - // D - amvp_PRECISION( eta2_pt, block_pt, phi_pt, num_eig_vect ); - block_pt += block_step_size; eta1_pt += num_eig_vect; eta2_pt += num_eig_vect; phi_pt += num_eig_vect; - } -} - - -void coarse_add_anti_block_diagonal_PRECISION( vector_PRECISION eta, vector_PRECISION phi, - config_PRECISION block, int length, level_struct *l ) { - - int num_eig_vect = l->num_lattice_site_var/2, - block_step_size = (num_eig_vect * (num_eig_vect+1))/2; - config_PRECISION block_pt = block; - vector_PRECISION phi_pt=phi, eta_pt=eta, phi_end_pt=phi+length; - // U(x) = [ A 0 , A=-A*, D=-D* diag. excluded - // 0 D ] - // storage order: upper triangle of A, upper triangle of D, columnwise - // diagonal coupling - while ( phi_pt < phi_end_pt ) { - // A - amvp_PRECISION( eta_pt, block_pt, phi_pt, num_eig_vect ); - block_pt += block_step_size; eta_pt += num_eig_vect; phi_pt += num_eig_vect; - // D - amvp_PRECISION( eta_pt, block_pt, phi_pt, num_eig_vect ); - block_pt += block_step_size; eta_pt += num_eig_vect; phi_pt += num_eig_vect; - } -} - void coarse_spinwise_self_couplings_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, vector_PRECISION phi, config_PRECISION clover, int length, level_struct *l ) { - int num_eig_vect = l->num_lattice_site_var/2, + int num_eig_vect = l->num_parent_eig_vect, clover_step_size1 = (num_eig_vect * (num_eig_vect+1))/2, - clover_step_size2 = SQUARE(l->num_lattice_site_var/2); + clover_step_size2 = SQUARE(num_eig_vect); config_PRECISION clover_pt = clover; vector_PRECISION phi_pt=phi, eta1_pt=eta1, eta2_pt=eta2+num_eig_vect, phi_end_pt=phi+length; // U(x) = [ A B , A=A*, D=D*, C = -B* @@ -476,12 +445,127 @@ void coarse_spinwise_self_couplings_PRECISION( vector_PRECISION eta1, vector_PRE } } +void coarse_operator_PRECISION_set_couplings( operator_PRECISION_struct *op, level_struct *l, + struct Thread *threading ) { + + coarse_operator_PRECISION_set_neighbor_couplings( op, l, threading ); + coarse_operator_PRECISION_set_self_couplings( op, l, threading ); + +} + +void coarse_operator_PRECISION_set_neighbor_couplings( operator_PRECISION_struct *op, level_struct *l, + struct Thread *threading ) { + +#ifdef OPTIMIZED_COARSE_NEIGHBOR_COUPLING_PRECISION + int nc_size = SQUARE(l->num_parent_eig_vect*2); + int n1, n2; + int column_offset = 2*SIMD_LENGTH_PRECISION*((l->num_parent_eig_vect+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); + int offset_v = 4*l->num_parent_eig_vect*column_offset; + + if ( l->depth > 0 && l->level>0 ) { + n1 = l->num_lattice_sites; + n2 = 2*l->num_lattice_sites-l->num_inner_lattice_sites; + } else { + n1 = l->num_inner_lattice_sites; + n2 = l->num_inner_lattice_sites; + } + int start, end; + compute_core_start_end_custom(0, n1, &start, &end, l, threading, 1); + int n_per_core = end-start; + START_LOCKED_MASTER(threading) + if( op->D_vectorized == NULL ) { + // 2 is for complex, 4 is for 4 directions + MALLOC_HUGEPAGES( op->D_vectorized, OPERATOR_TYPE_PRECISION, 4*offset_v*n2, 64 ); + MALLOC_HUGEPAGES( op->D_transformed_vectorized, OPERATOR_TYPE_PRECISION, 4*offset_v*n2, 64 ); + } + END_LOCKED_MASTER(threading) + + 
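
The column_offset values above all use the same round-up idiom, SIMD_LENGTH_PRECISION*((x+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION), which pads the number of matrix columns to the next SIMD multiple so that every column of the vectorized coarse links starts on an aligned boundary. Below is a minimal stand-alone sketch of just that arithmetic; the helper name round_up_to_simd and the sample numbers (nv = 20 coarse eigenvectors, SIMD length 8) are illustrative only and not taken from the library.

  #include <stdio.h>

  /* round x up to the next multiple of simd, i.e. the
     SIMD_LENGTH_PRECISION*((x+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION)
     pattern used for column_offset above */
  static int round_up_to_simd( int x, int simd ) {
    return simd * ( ( x + simd - 1 ) / simd );
  }

  int main( void ) {
    int nv = 20, simd = 8;                                 /* illustrative values only */
    int column_offset = 2 * round_up_to_simd( nv, simd );  /* factor 2 for complex, as the comment above notes */
    int offset_v = 4 * nv * column_offset;                 /* mirrors the offset_v expression above */
    printf( "padded columns: %d, column_offset: %d, offset_v: %d\n",
            round_up_to_simd( nv, simd ), column_offset, offset_v );
    return 0;
  }
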
copy_coarse_operator_to_vectorized_layout_PRECISION( + op->D + 4*start*nc_size, + op->D_vectorized + 4*start*offset_v, + n_per_core, l->num_parent_eig_vect); + copy_coarse_operator_to_transformed_vectorized_layout_PRECISION( + op->D + 4*start*nc_size, + op->D_transformed_vectorized + 4*start*offset_v, + n_per_core, l->num_parent_eig_vect); + // vectorize negative boundary + if ( n2>n1 ) { + compute_core_start_end_custom(n1, n2, &start, &end, l, threading, 1); + n_per_core = end-start; + copy_coarse_operator_to_vectorized_layout_PRECISION( + op->D + 4*start*nc_size, + op->D_vectorized + 4*start*offset_v, + n_per_core, l->num_parent_eig_vect); + copy_coarse_operator_to_transformed_vectorized_layout_PRECISION( + op->D + 4*start*nc_size, + op->D_transformed_vectorized + 4*start*offset_v, + n_per_core, l->num_parent_eig_vect); + } + SYNC_CORES(threading) +#endif + +} + +void coarse_operator_PRECISION_set_self_couplings( operator_PRECISION_struct *op, level_struct *l, + struct Thread *threading ) { + +#ifdef OPTIMIZED_COARSE_SELF_COUPLING_PRECISION + int n = l->num_inner_lattice_sites, nv = l->num_parent_eig_vect; + int sc_size = (nv)*(nv*2+1); + int start, end; + compute_core_start_end_custom(0, n, &start, &end, l, threading, 1); + int n_per_core = end-start; -#ifndef VECTORIZE_COARSE_OPERATOR_PRECISION -void coarse_operator_PRECISION_set_couplings( operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { } -void coarse_operator_PRECISION_set_couplings_clover( operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { } + int column_offset = SIMD_LENGTH_PRECISION*((2*nv+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); + int offset_v = 2*2*nv*column_offset; + if( op->clover_vectorized == NULL ) { + START_LOCKED_MASTER(threading) + MALLOC_HUGEPAGES( op->clover_vectorized, OPERATOR_TYPE_PRECISION, offset_v*n, 64 ); + END_LOCKED_MASTER(threading) + } + copy_coarse_operator_clover_to_vectorized_layout_PRECISION( + op->clover + start*sc_size, + op->clover_vectorized + start*offset_v, + n_per_core, nv); +#ifdef HAVE_TM + int tm_size = (nv)*(nv+1); + if ( op->mu + op->mu_odd_shift != 0.0 || op->mu + op->mu_even_shift != 0.0 ) + add_tm_term_to_vectorized_layout_PRECISION( + op->tm_term + start*tm_size, + op->clover_vectorized + start*offset_v, + n_per_core, nv); #endif +#ifdef HAVE_TM1p1 + int column_doublet_offset = SIMD_LENGTH_PRECISION*((4*nv+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); + int offset_doublet_v = 2*4*nv*column_doublet_offset; + int eps_size = (nv)*(nv+1); + if( op->clover_doublet_vectorized == NULL ) { + START_LOCKED_MASTER(threading) + MALLOC_HUGEPAGES( op->clover_doublet_vectorized, OPERATOR_TYPE_PRECISION, offset_doublet_v*n, 64 ); + END_LOCKED_MASTER(threading) + } + copy_coarse_operator_clover_to_doublet_vectorized_layout_PRECISION( + op->clover + start*sc_size, + op->clover_doublet_vectorized + start*offset_doublet_v, + n_per_core, nv); + if ( op->epsbar != 0 || op->epsbar_ig5_odd_shift != 0 || op->epsbar_ig5_odd_shift != 0 ) + add_epsbar_term_to_doublet_vectorized_layout_PRECISION( + op->epsbar_term + start*eps_size, + op->clover_doublet_vectorized + start*offset_doublet_v, + n_per_core, nv); +#ifdef HAVE_TM + if ( op->mu + op->mu_odd_shift != 0.0 || op->mu + op->mu_even_shift != 0.0 ) + add_tm_term_to_doublet_vectorized_layout_PRECISION( + op->tm_term + start*tm_size, + op->clover_doublet_vectorized + start*offset_doublet_v, + n_per_core, nv); +#endif +#endif + SYNC_CORES(threading) +#endif +} + void coarse_gamma5_PRECISION( 
vector_PRECISION eta, vector_PRECISION phi, int start, int end, level_struct *l ) { int j, k=l->num_lattice_site_var/2; @@ -513,24 +597,66 @@ void coarse_gamma5_PRECISION( vector_PRECISION eta, vector_PRECISION phi, int st } } -#ifndef VECTORIZE_COARSE_OPERATOR_PRECISION +void coarse_tau1_gamma5_PRECISION( vector_PRECISION eta, vector_PRECISION phi, int start, int end, level_struct *l ) { + +#ifdef HAVE_TM1p1 + if ( g.n_flavours == 2 ) { + int j, k=l->num_lattice_site_var/4; + vector_PRECISION eta_end; + + eta_end = eta+end; + phi += start; + eta += start; + + ASSERT( eta != phi ); + while ( eta < eta_end ) { + phi += k; + for ( j=0; jclover, l->inner_vector_size, l ); -#ifdef HAVE_TM - if (g.tm_mu + g.tm_mu_odd_shift != 0.0 || g.tm_mu + g.tm_mu_even_shift != 0.0 ) - coarse_add_anti_block_diagonal_PRECISION( eta, phi, op->tm_term, l->inner_vector_size, l ); -#endif - END_LOCKED_MASTER(threading) + int start; + int end; + compute_core_start_end_custom(0, l->num_inner_lattice_sites, &start, &end, l, threading, 1); + + coarse_self_couplings_PRECISION( eta, phi, op, start, end, l); + PROF_PRECISION_STOP( _SC, 1, threading ); PROF_PRECISION_START( _NC, threading ); - coarse_hopping_term_PRECISION( eta, phi, op, _FULL_SYSTEM, l, threading ); + + coarse_pn_hopping_term_PRECISION( eta, phi, op, _FULL_SYSTEM, +1, l, threading ); + PROF_PRECISION_STOP( _NC, 1, threading ); } -#endif void g5D_apply_coarse_operator_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { @@ -557,7 +683,7 @@ void coarse_operator_PRECISION_test_routine( level_struct *l, struct Thread *thr if ( !l->idle ) { int vs = l->vector_size, ivs = l->inner_vector_size, cvs = l->next_level->vector_size, civs = l->next_level->inner_vector_size; - double diff; + PRECISION diff = 0; vector_PRECISION vp1=NULL, vp2, vp3, vp4, vc1=NULL, vc2, vc3; PUBLIC_MALLOC( vp1, complex_PRECISION, 4*vs ); @@ -568,133 +694,175 @@ void coarse_operator_PRECISION_test_routine( level_struct *l, struct Thread *thr vp2 = vp1 + vs; vp3 = vp2 + vs; vp4 = vp3 + vs; vc2 = vc1 + cvs; vc3 = vc2 + cvs; START_LOCKED_MASTER(threading) -#ifdef INTERPOLATION_OPERATOR_LAYOUT_OPTIMIZED_PRECISION - double norm = 0.0; - double dot = 0.0; - float *op = (float *)l->is_PRECISION.operator; - float *op2 = (float *)(l->is_PRECISION.operator+0*SIMD_LENGTH_PRECISION*l->vector_size)+1; - for ( int i=0; iinner_vector_size; i++ ) - norm += (op[2*i*SIMD_LENGTH_PRECISION+0] + I*op[2*i*SIMD_LENGTH_PRECISION+SIMD_LENGTH_PRECISION])*conj(op[2*i*SIMD_LENGTH_PRECISION+0] + I*op[2*i*SIMD_LENGTH_PRECISION+SIMD_LENGTH_PRECISION]); - for ( int i=0; iinner_vector_size; i++ ) - dot += (op[2*i*SIMD_LENGTH_PRECISION+0] + I*op[2*i*SIMD_LENGTH_PRECISION+SIMD_LENGTH_PRECISION])*conj(op2[2*i*SIMD_LENGTH_PRECISION+0] + I*op2[2*i*SIMD_LENGTH_PRECISION+SIMD_LENGTH_PRECISION]); - diff = dot/norm; +#ifdef HAVE_TM1p1 + if(g.n_flavours == 1) +#endif + { +#ifdef OPTIMIZED_INTERPOLATION_OPERATOR_PRECISION + double norm = 0.0; + double dot = 0.0; + float *op = (float *)l->is_PRECISION.operator; + float *op2 = (float *)(l->is_PRECISION.operator+0*SIMD_LENGTH_PRECISION*l->vector_size)+1; + for ( int i=0; iinner_vector_size; i++ ) + norm += (op[2*i*SIMD_LENGTH_PRECISION+0] + I*op[2*i*SIMD_LENGTH_PRECISION+SIMD_LENGTH_PRECISION])*conj(op[2*i*SIMD_LENGTH_PRECISION+0] + I*op[2*i*SIMD_LENGTH_PRECISION+SIMD_LENGTH_PRECISION]); + for ( int i=0; iinner_vector_size; i++ ) + dot += (op[2*i*SIMD_LENGTH_PRECISION+0] + 
I*op[2*i*SIMD_LENGTH_PRECISION+SIMD_LENGTH_PRECISION])*conj(op2[2*i*SIMD_LENGTH_PRECISION+0] + I*op2[2*i*SIMD_LENGTH_PRECISION+SIMD_LENGTH_PRECISION]); + diff = dot/norm; #else - diff = global_inner_product_PRECISION( l->is_PRECISION.interpolation[0], l->is_PRECISION.interpolation[1], 0, ivs, l, no_threading ) - / global_norm_PRECISION( l->is_PRECISION.interpolation[0], 0, ivs, l, no_threading ); + diff = global_inner_product_PRECISION( l->is_PRECISION.interpolation[0], l->is_PRECISION.interpolation[1], 0, ivs, l, no_threading ) + / global_norm_PRECISION( l->is_PRECISION.interpolation[0], 0, ivs, l, no_threading ); #endif - printf0("depth: %d, correctness of block_gram_schmidt: %le\n", l->depth, cabs(diff) ); - if(diff > g.test) g.test = diff; + test0_PRECISION("depth: %d, correctness of block_gram_schmidt: %le\n", l->depth, cabs(diff) ); + } if ( !l->next_level->idle ) - vector_PRECISION_define_random( vc1, 0, civs, l->next_level ); + vector_PRECISION_define_random( vc1, 0, civs, l->next_level, no_threading ); vector_PRECISION_distribute( vc2, vc1, l->next_level ); vector_PRECISION_gather( vc3, vc2, l->next_level ); if ( !l->next_level->idle ) { vector_PRECISION_minus( vc2, vc1, vc3, 0, civs, l->next_level ); diff = global_norm_PRECISION( vc2, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( vc1, 0, civs, l->next_level, no_threading ); } - printf0("depth: %d, correctness of gather( distribute( phi_c ) ) : %le\n", l->depth, diff ); - if(diff > g.test) g.test = diff; - + test0_PRECISION("depth: %d, correctness of gather( distribute( phi_c ) ) : %le\n", l->depth, diff ); + if ( !l->next_level->idle ) - vector_PRECISION_define_random( vc1, 0, civs, l->next_level ); - interpolate3_PRECISION( vp1, vc1, l, no_threading ); + vector_PRECISION_define_random( vc1, 0, civs, l->next_level, no_threading ); + vector_PRECISION_define_zero( vp1, 0, ivs, l, no_threading ); + interpolate_PRECISION( vp1, vc1, l, no_threading ); restrict_PRECISION( vc2, vp1, l, no_threading ); if ( !l->next_level->idle ) { vector_PRECISION_minus( vc3, vc1, vc2, 0, civs, l->next_level ); diff = global_norm_PRECISION( vc3, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( vc1, 0, civs, l->next_level, no_threading ); - printf0("depth: %d, correctness of ( P* P - 1 ) phi_c: %le\n", l->depth, diff ); - if(diff > g.test) g.test = diff; + test0_PRECISION("depth: %d, correctness of ( P* P - 1 ) phi_c: %le\n", l->depth, abs_PRECISION(diff) ); } - - END_LOCKED_MASTER(threading) + + END_LOCKED_MASTER(threading); if(threading->n_core>1) { - interpolate3_PRECISION( vp1, vc1, l, threading ); + vector_PRECISION_define_zero( vp1, 0, ivs, l, threading ); + interpolate_PRECISION( vp1, vc1, l, threading ); restrict_PRECISION( vc2, vp1, l, threading ); START_LOCKED_MASTER(threading) if ( !l->next_level->idle ) { vector_PRECISION_minus( vc3, vc1, vc2, 0, civs, l->next_level ); diff = global_norm_PRECISION( vc3, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( vc1, 0, civs, l->next_level, no_threading ); - printf0("depth: %d, correctness of ( P* P - 1 ) phi_c with threading: %le\n", l->depth, diff ); - if(diff > g.test) g.test = diff; + test0_PRECISION("depth: %d, correctness of ( P* P - 1 ) phi_c with threading: %le\n", l->depth, diff ); } END_LOCKED_MASTER(threading) } -#ifdef HAVE_TM - int tm_site_size = (l->next_level->num_lattice_site_var/2*(l->next_level->num_lattice_site_var/2+1)); - config_PRECISION tm_term=NULL; - PUBLIC_MALLOC( tm_term, complex_PRECISION, 
tm_site_size*l->next_level->num_inner_lattice_sites ); + START_LOCKED_MASTER(threading) + if (l->depth==0) + gamma5_PRECISION( vp2, vp1, l, no_threading ); + else + coarse_gamma5_PRECISION( vp2, vp1, 0, ivs, l ); + restrict_PRECISION( vc2, vp2, l, no_threading ); + coarse_gamma5_PRECISION( vc3, vc2, 0, civs, l->next_level ); + if ( !l->next_level->idle ) { + vector_PRECISION_minus( vc2, vc1, vc3, 0, civs, l->next_level ); + diff = global_norm_PRECISION( vc2, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( vc1, 0, civs, l->next_level, no_threading ); + test0_PRECISION("depth: %d, correctness of ( g5_c P* g5 P - 1 ) phi_c: %le\n", l->depth, diff ); + } +#ifdef HAVE_TM1p1 + if(g.n_flavours == 2) { + if (l->depth==0) + tau1_gamma5_PRECISION( vp2, vp1, l, no_threading ); + else + coarse_tau1_gamma5_PRECISION( vp2, vp1, 0, ivs, l ); + restrict_PRECISION( vc2, vp2, l, no_threading ); + coarse_tau1_gamma5_PRECISION( vc3, vc2, 0, civs, l->next_level ); + if ( !l->next_level->idle ) { + vector_PRECISION_minus( vc2, vc1, vc3, 0, civs, l->next_level ); + diff = global_norm_PRECISION( vc2, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( vc1, 0, civs, l->next_level, no_threading ); + test0_PRECISION("depth: %d, correctness of ( tau1 g5_c P* tau1 g5 P - 1 ) phi_c: %le\n", l->depth, diff ); + } + } +#endif + END_LOCKED_MASTER(threading) START_LOCKED_MASTER(threading) - if (g.tm_mu + g.tm_mu_odd_shift != 0.0 || g.tm_mu + g.tm_mu_even_shift != 0.0 ) { - tm_term_PRECISION_setup( tm_term, l->next_level->s_PRECISION.op.odd_proj, l->next_level, no_threading ); + vector_PRECISION_define_zero( vp2, 0, ivs, l, no_threading ); + if (l->depth==0) + add_diagonal_PRECISION( vp2, vp1, l->s_PRECISION.op.odd_proj, ivs ); + else + coarse_add_block_diagonal_PRECISION( vp2, vp1, l->s_PRECISION.op.odd_proj, ivs, l ); + restrict_PRECISION( vc2, vp2, l, no_threading ); + + vector_PRECISION_scale( vc2, vc2, -1.0, 0, civs, l->next_level, no_threading ); + coarse_add_block_diagonal_PRECISION( vc2, vc1, l->next_level->s_PRECISION.op.odd_proj, civs, l->next_level ); + diff = global_norm_PRECISION( vc2, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( vc1, 0, civs, l->next_level, no_threading ); + test0_PRECISION("depth: %d, correctness of ( P* 1odd P - 1odd_c ) phi_c: %le\n", l->depth, diff ); + END_LOCKED_MASTER(threading) + +#ifdef HAVE_TM + START_LOCKED_MASTER(threading) + if (g.mu + g.mu_odd_shift != 0.0 || g.mu + g.mu_even_shift != 0.0 ) { + vector_PRECISION_define_zero( vp2, 0, ivs, l, no_threading ); + if (l->depth==0) + add_diagonal_PRECISION( vp2, vp1, l->s_PRECISION.op.tm_term, ivs ); + else + coarse_add_anti_block_diagonal_PRECISION( vp2, vp1, l->s_PRECISION.op.tm_term, ivs, l ); + restrict_PRECISION( vc2, vp2, l, no_threading ); - vector_PRECISION_define( vc2, 0, 0, civs, l ); - vector_PRECISION_define( vc3, 0, 0, civs, l ); + vector_PRECISION_scale( vc2, vc2, -g.mu_factor[l->next_level->depth]/g.mu_factor[l->depth], 0, civs, l->next_level, no_threading ); coarse_add_anti_block_diagonal_PRECISION( vc2, vc1, l->next_level->s_PRECISION.op.tm_term, civs, l->next_level ); - coarse_add_anti_block_diagonal_PRECISION( vc3, vc1, tm_term, civs, l->next_level ); - - vector_PRECISION_minus( vc3, vc3, vc2, 0, civs, l->next_level ); - diff = global_norm_PRECISION( vc3, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( vc2, 0, civs, l->next_level, no_threading ); - printf0("depth: %d, correctness of building tm_term: %le\n", l->next_level->depth, diff ); - if(diff > g.test) 
g.test = diff; + diff = global_norm_PRECISION( vc2, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( vc1, 0, civs, l->next_level, no_threading ); + test0_PRECISION("depth: %d, correctness of ( P* tm P - tm_c ) phi_c: %le\n", l->depth, diff ); } - END_LOCKED_MASTER(threading) - - if (g.tm_mu + g.tm_mu_odd_shift != 0.0 || g.tm_mu + g.tm_mu_even_shift != 0.0 ) - if(threading->n_core>1) { - - tm_term_PRECISION_setup( tm_term, l->next_level->s_PRECISION.op.odd_proj, l->next_level, no_threading ); - - START_LOCKED_MASTER(threading) - vector_PRECISION_define( vc3, 0, 0, civs, l ); - coarse_add_anti_block_diagonal_PRECISION( vc3, vc1, tm_term, civs, l->next_level ); - - vector_PRECISION_minus( vc3, vc3, vc2, 0, civs, l->next_level ); - diff = global_norm_PRECISION( vc3, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( vc2, 0, civs, l->next_level, no_threading ); - printf0("depth: %d, correctness of building tm_term with threading: %le\n", l->next_level->depth, diff ); - if(diff > g.test) g.test = diff; - END_LOCKED_MASTER(threading) - } - - PUBLIC_FREE( tm_term, complex_PRECISION, tm_site_size*l->next_level->num_inner_lattice_sites ); + END_LOCKED_MASTER(threading) +#endif +#ifdef HAVE_TM1p1 + START_LOCKED_MASTER(threading) + if ( g.n_flavours == 2 && + ( g.epsbar != 0 || g.epsbar_ig5_odd_shift != 0 || g.epsbar_ig5_odd_shift != 0 ) ) { + vector_PRECISION_define_zero( vp2, 0, ivs, l, no_threading ); + if (l->depth==0) + apply_doublet_coupling_PRECISION( vp2, vp1, l->s_PRECISION.op.epsbar_term, ivs ); + else + coarse_add_doublet_coupling_PRECISION( vp2, vp1, l->s_PRECISION.op.epsbar_term, ivs, l ); + restrict_PRECISION( vc2, vp2, l, no_threading ); + + vector_PRECISION_scale( vc2, vc2, -g.epsbar_factor[l->next_level->depth]/g.epsbar_factor[l->depth], 0, civs, l->next_level, no_threading ); + coarse_add_doublet_coupling_PRECISION( vc2, vc1, l->next_level->s_PRECISION.op.epsbar_term, civs, l->next_level ); + diff = global_norm_PRECISION( vc2, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( vc1, 0, civs, l->next_level, no_threading ); + test0_PRECISION("depth: %d, correctness of ( P* eps P - eps_c ) phi_c: %le\n", l->depth, diff ); + } + END_LOCKED_MASTER(threading) #endif - + if ( l->level > 0 ) { START_LOCKED_MASTER(threading) - interpolate3_PRECISION( vp1, vc1, l, no_threading ); - apply_operator_PRECISION( vp2, vp1, &(l->p_PRECISION), l, no_threading ); + vector_PRECISION_define_zero( vp1, 0, ivs, l, no_threading ); + interpolate_PRECISION( vp1, vc1, l, no_threading ); + apply_operator_PRECISION( vp2, vp1, &(l->p_PRECISION), l, no_threading ); + #ifdef HAVE_TM - if (g.tm_mu + g.tm_mu_odd_shift != 0.0 || g.tm_mu + g.tm_mu_even_shift != 0.0 ) - if (g.tm_mu_factor[l->depth] != g.tm_mu_factor[l->next_level->depth]) { - vector_PRECISION_scale( vp3, vp1, (g.tm_mu_factor[l->next_level->depth]/g.tm_mu_factor[l->depth])-1., - 0, ivs, l->next_level ); - if(l->depth == 0) - add_diagonal_PRECISION( vp2, vp3, l->s_PRECISION.op.tm_term, ivs ); - else - coarse_add_anti_block_diagonal_PRECISION( vp2, vp3, l->s_PRECISION.op.tm_term, ivs, l ); - } -#endif - + if (g.mu + g.mu_odd_shift != 0.0 || g.mu + g.mu_even_shift != 0.0 ) + if (g.mu_factor[l->depth] != g.mu_factor[l->next_level->depth]) { + vector_PRECISION_scale( vp3, vp1, (g.mu_factor[l->next_level->depth]/g.mu_factor[l->depth])-1., 0, ivs, l, no_threading ); + if(l->depth == 0) + add_diagonal_PRECISION( vp2, vp3, l->p_PRECISION.op->tm_term, ivs ); + else + coarse_add_anti_block_diagonal_PRECISION( vp2, vp3, 
l->p_PRECISION.op->tm_term, ivs, l ); + } +#endif restrict_PRECISION( vc2, vp2, l, no_threading ); + if ( !l->next_level->idle ) { if ( l->level==1 && g.odd_even ) coarse_odd_even_PRECISION_test( vc3, vc1, l->next_level, no_threading ); else apply_operator_PRECISION( vc3, vc1, &(l->next_level->p_PRECISION), l->next_level, no_threading ); - + vector_PRECISION_minus( vc3, vc2, vc3, 0, civs, l->next_level ); - diff = global_norm_PRECISION( vc3, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( vc2, 0, civs, l->next_level, no_threading ); + diff = global_norm_PRECISION( vc3, 0, civs, l->next_level, no_threading ) /global_norm_PRECISION( vc2, 0, civs, l->next_level, no_threading ); + if ( l->level==1 && g.odd_even ) { - printf0("depth: %d, correctness of odd even preconditioned ( P* D P - D_c ) phi_c: %le\n", l->depth, diff ); - if(diff > g.test) g.test = diff; + test0_PRECISION("depth: %d, correctness of odd even preconditioned ( P* D P - D_c ) phi_c: %le\n", l->depth, diff ); } else { - printf0("depth: %d, correctness of ( P* D P - D_c ) phi_c: %le\n", l->depth, diff ); - if(diff > g.test) g.test = diff; - } + test0_PRECISION("depth: %d, correctness of ( P* D P - D_c ) phi_c: %le\n", l->depth, diff ); + } } END_LOCKED_MASTER(threading) @@ -710,21 +878,19 @@ void coarse_operator_PRECISION_test_routine( level_struct *l, struct Thread *thr vector_PRECISION_minus( vc3, vc2, vc3, 0, civs, l->next_level ); diff = global_norm_PRECISION( vc3, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( vc2, 0, civs, l->next_level, no_threading ); if ( l->level==1 && g.odd_even ) { //TODO: this test doesn't work without SSE!! - printf0("depth: %d, correctness of odd even preconditioned ( P* D P - D_c ) phi_c with D_c threaded: %le\n", l->depth, diff ); - if(diff > g.test) g.test = diff; - } else { - printf0("depth: %d, correctness of ( P* D P - D_c ) phi_c with D_c threaded: %le\n", l->depth, diff ); - if(diff > g.test) g.test = diff; - } - } + test0_PRECISION("depth: %d, correctness of odd even preconditioned ( P* D P - D_c ) phi_c with D_c threaded: %le\n", l->depth, diff ); + } else { + test0_PRECISION("depth: %d, correctness of ( P* D P - D_c ) phi_c with D_c threaded: %le\n", l->depth, diff ); + } + } END_LOCKED_MASTER(threading) - } + } } START_LOCKED_MASTER(threading) - + /* if ( l->level > 0 && l->depth > 0 && g.method == 3 && g.odd_even ) { - vector_PRECISION_define_random( vp1, 0, ivs, l ); + vector_PRECISION_define_random( vp1, 0, ivs, l, no_threading ); block_to_oddeven_PRECISION( vp4, vp1, l, no_threading ); coarse_diag_ee_PRECISION( vp3, vp4, &(l->oe_op_PRECISION), l, no_threading ); coarse_diag_oo_PRECISION( vp3, vp4, &(l->oe_op_PRECISION), l, no_threading ); @@ -733,8 +899,7 @@ void coarse_operator_PRECISION_test_routine( level_struct *l, struct Thread *thr apply_operator_PRECISION( vp2, vp1, &(l->p_PRECISION), l, no_threading ); vector_PRECISION_minus( vp4, vp4, vp2, 0, ivs, l ); diff = global_norm_PRECISION( vp4, 0, ivs, l, no_threading ) / global_norm_PRECISION( vp2, 0, ivs, l, no_threading ); - printf0("depth: %d, correctness of odd even layout (smoother): %le\n", l->depth, diff ); - if(diff > g.test) g.test = diff; + test0_PRECISION("depth: %d, correctness of odd even layout (smoother): %le\n", l->depth, diff ); block_to_oddeven_PRECISION( vp4, vp1, l, no_threading ); coarse_odd_even_PRECISION_test( vp3, vp4, l, no_threading ); @@ -742,10 +907,9 @@ void coarse_operator_PRECISION_test_routine( level_struct *l, struct Thread *thr apply_operator_PRECISION( vp2, vp1, 
&(l->p_PRECISION), l, no_threading ); vector_PRECISION_minus( vp4, vp4, vp2, 0, ivs, l ); diff = global_norm_PRECISION( vp4, 0, ivs, l, no_threading ) / global_norm_PRECISION( vp2, 0, ivs, l, no_threading ); - printf0("depth: %d, correctness of odd even preconditioned operator (smoother): %le\n", l->depth, diff ); - if(diff > g.test) g.test = diff; - } - + test0_PRECISION("depth: %d, correctness of odd even preconditioned operator (smoother): %le\n", l->depth, diff ); + } + */ FREE( vp1, complex_PRECISION, 4*vs ); FREE( vc1, complex_PRECISION, 3*cvs ); END_LOCKED_MASTER(threading) diff --git a/src/coarse_operator_generic.h b/src/coarse_operator_generic.h index c730412..8b42e66 100644 --- a/src/coarse_operator_generic.h +++ b/src/coarse_operator_generic.h @@ -22,27 +22,29 @@ #ifndef COARSE_OPERATOR_PRECISION_HEADER #define COARSE_OPERATOR_PRECISION_HEADER - #include "blas_vectorized.h" + #include "simd_blas_PRECISION.h" struct Thread; void coarse_operator_PRECISION_alloc( level_struct *l ); void coarse_operator_PRECISION_free( level_struct *l ); + void coarse_operator_PRECISION_free_vectorized( operator_PRECISION_struct *op, level_struct *l ); void coarse_operator_PRECISION_setup( vector_PRECISION *V, level_struct *l ); + void coarse_operator_PRECISION_setup_finalize( level_struct *l, struct Thread *threading ); void coarse_operator_PRECISION_set_couplings( operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); - void coarse_operator_PRECISION_set_couplings_clover( operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); + void coarse_operator_PRECISION_set_self_couplings( operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); + void coarse_operator_PRECISION_set_neighbor_couplings( operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); void set_coarse_self_coupling_PRECISION( vector_PRECISION buffer1, vector_PRECISION buffer2, vector_PRECISION *V, const int n, level_struct *l ); void set_coarse_neighbor_coupling_PRECISION( vector_PRECISION buffer1, vector_PRECISION buffer2, vector_PRECISION *V, const int mu, const int n, level_struct *l ); - void coarse_self_couplings_PRECISION( vector_PRECISION eta, config_PRECISION clover, - vector_PRECISION phi, int length, level_struct *l ); void coarse_spinwise_self_couplings_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, vector_PRECISION phi, config_PRECISION clover, int length, level_struct *l ); void coarse_gamma5_PRECISION( vector_PRECISION eta, vector_PRECISION phi, int start, int end, level_struct *l ); + void coarse_tau1_gamma5_PRECISION( vector_PRECISION eta, vector_PRECISION phi, int start, int end, level_struct *l ); void apply_coarse_operator_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); void g5D_apply_coarse_operator_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operator_PRECISION_struct *op, @@ -59,12 +61,7 @@ void set_block_diagonal_PRECISION( vector_PRECISION spin_0_1, vector_PRECISION spin_2_3, vector_PRECISION *V, const int n, config_PRECISION block, level_struct *l ); void coarse_aggregate_block_diagonal_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, vector_PRECISION phi, config_PRECISION block, level_struct *l ); - - void coarse_aggregate_anti_block_diagonal_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, vector_PRECISION phi, config_PRECISION block, level_struct *l ); - - void coarse_add_anti_block_diagonal_PRECISION( 
vector_PRECISION eta, vector_PRECISION phi, config_PRECISION block, int length, level_struct *l ); - - + void coarse_operator_PRECISION_test_routine( level_struct *l, struct Thread *threading ); // eta += D*phi, D stored columnwise @@ -124,8 +121,42 @@ } } + // eta += D*phi, D hermitian and stored columnwise packed + static inline void pmvp_PRECISION( const vector_PRECISION eta, const complex_PRECISION *D, + const vector_PRECISION phi, const register int n ) { + register int i, j, k; + + eta[0] += D[0]*phi[0]; + for ( i=1, k=1; inum_lattice_site_var/2, - num_eig_vect2 = SQUARE(l->num_lattice_site_var/2); - // U_mu(x) = [ A B , U_-mu(x+muhat) = [ A* -C* - // C D ] -B* D* ] - // storage order: A, C, B, D - // note: minus sign of D = self_coupling - hopping_term is added here + // eta -= D*phi, D anti-hermitian and stored columnwise packed + static inline void mamvp_PRECISION( const vector_PRECISION eta, const complex_PRECISION *D, + const vector_PRECISION phi, const register int n ) { + register int i, j, k; - // A - nmv_PRECISION( eta, D, phi, num_eig_vect ); - // C - eta += num_eig_vect; - D += num_eig_vect2; - nmv_PRECISION( eta, D, phi, num_eig_vect ); - // B - phi += num_eig_vect; - eta -= num_eig_vect; - D += num_eig_vect2; - nmv_PRECISION( eta, D, phi, num_eig_vect ); - // D - eta += num_eig_vect; - D += num_eig_vect2; - nmv_PRECISION( eta, D, phi, num_eig_vect ); + eta[0] -= D[0]*phi[0]; + for ( i=1, k=1; inum_lattice_site_var/2, - num_eig_vect2 = SQUARE(l->num_lattice_site_var/2); - // U_mu(x) = [ A B , U_-mu(x+muhat) = [ A* -C* - // C D ] -B* D* ] - // storage order: A, C, B, D - // note: minus sign of D = self_coupling - hopping_term is added here - - // A* - nmvh_PRECISION( eta, D, phi, num_eig_vect ); - // -C* - phi += num_eig_vect; - D += num_eig_vect2; - mvh_PRECISION( eta, D, phi, num_eig_vect ); - // -B* - eta += num_eig_vect; - phi -= num_eig_vect; - D += num_eig_vect2; - mvh_PRECISION( eta, D, phi, num_eig_vect ); - // D* - phi += num_eig_vect; - D += num_eig_vect2; - nmvh_PRECISION( eta, D, phi, num_eig_vect ); + int site_var = l->num_lattice_site_var, + num_eig_vect = l->num_parent_eig_vect, + clover_step_size1 = (num_eig_vect * (num_eig_vect+1))/2, + clover_step_size2 = SQUARE(num_eig_vect); + config_PRECISION clover_pt = clover; + vector_PRECISION phi_pt=phi, eta_pt=eta, phi_end_pt=phi+length; + // U(x) = [ A B , A=A*, D=D*, C = -B* + // C D ] + // storage order: upper triangle of A, upper triangle of D, B, columnwise + // diagonal coupling +#ifdef HAVE_TM1p1 + if( g.n_flavours == 2 ) { + while ( phi_pt < phi_end_pt ) { + // A + mvp_PRECISION( eta_pt, clover_pt, phi_pt, num_eig_vect ); + eta_pt += num_eig_vect;//1 + phi_pt += num_eig_vect;//1 + mvp_PRECISION( eta_pt, clover_pt, phi_pt, num_eig_vect ); + // D + eta_pt += num_eig_vect;//2 + phi_pt += num_eig_vect;//2 + clover_pt += clover_step_size1; + mvp_PRECISION( eta_pt, clover_pt, phi_pt, num_eig_vect ); + eta_pt += num_eig_vect;//3 + phi_pt += num_eig_vect;//3 + mvp_PRECISION( eta_pt, clover_pt, phi_pt, num_eig_vect ); + // C = -B* + eta_pt -= num_eig_vect;//2 + phi_pt -= 3*num_eig_vect;//0 + clover_pt += clover_step_size1; + nmvh_PRECISION( eta_pt, clover_pt, phi_pt, num_eig_vect ); + eta_pt += num_eig_vect;//3 + phi_pt += num_eig_vect;//1 + nmvh_PRECISION( eta_pt, clover_pt, phi_pt, num_eig_vect ); + // B + eta_pt -= 3*num_eig_vect;//0 + phi_pt += num_eig_vect;//2 + mv_PRECISION( eta_pt, clover_pt, phi_pt, num_eig_vect ); + eta_pt += num_eig_vect;//1 + phi_pt += num_eig_vect;//3 + mv_PRECISION( eta_pt, clover_pt, 
phi_pt, num_eig_vect ); + eta_pt += 3*num_eig_vect;//4 + phi_pt += num_eig_vect;//4 + clover_pt += clover_step_size2; + } + } else +#endif + while ( phi_pt < phi_end_pt ) { + // A + mvp_PRECISION( eta_pt, clover_pt, phi_pt, num_eig_vect ); + clover_pt += clover_step_size1; eta_pt += num_eig_vect; phi_pt += num_eig_vect; + // D + mvp_PRECISION( eta_pt, clover_pt, phi_pt, num_eig_vect ); + clover_pt += clover_step_size1; phi_pt -= num_eig_vect; + // C = -B* + nmvh_PRECISION( eta_pt, clover_pt, phi_pt, num_eig_vect ); + phi_pt += num_eig_vect; eta_pt -= num_eig_vect; + // B + mv_PRECISION( eta_pt, clover_pt, phi_pt, num_eig_vect ); + clover_pt += clover_step_size2; phi_pt += num_eig_vect; eta_pt += site_var; + } } - - static inline void coarse_n_hopp_PRECISION( vector_PRECISION eta, vector_PRECISION phi, - config_PRECISION D, level_struct *l ) { - - int num_eig_vect = l->num_lattice_site_var/2, - num_eig_vect2 = SQUARE(l->num_lattice_site_var/2); - // U_mu(x) = [ A B , U_-mu(x+muhat) = [ A* -C* - // C D ] -B* D* ] - // storage order: A, C, B, D - // note: minus sign of D = self_coupling - hopping_term is added here - // A - mv_PRECISION( eta, D, phi, num_eig_vect ); - // C - eta += num_eig_vect; - D += num_eig_vect2; - mv_PRECISION( eta, D, phi, num_eig_vect ); - // B - phi += num_eig_vect; - eta -= num_eig_vect; - D += num_eig_vect2; - mv_PRECISION( eta, D, phi, num_eig_vect ); - // D - eta += num_eig_vect; - D += num_eig_vect2; - mv_PRECISION( eta, D, phi, num_eig_vect ); + static inline void coarse_add_block_diagonal_PRECISION( vector_PRECISION eta, vector_PRECISION phi, + config_PRECISION block, int length, level_struct *l ) { + + int num_eig_vect = l->num_parent_eig_vect, + block_step_size = (num_eig_vect * (num_eig_vect+1))/2; + config_PRECISION block_pt = block; + vector_PRECISION phi_pt=phi, eta_pt=eta, phi_end_pt=phi+length; + // U(x) = [ A 0 , A=A*, D=D* diag. 
excluded + // 0 D ] + // storage order: upper triangle of A, upper triangle of D, columnwise + // diagonal coupling +#ifdef HAVE_TM1p1 + if( g.n_flavours == 2 ) { + while ( phi_pt < phi_end_pt ) { + // A + pmvp_PRECISION( eta_pt, block_pt, phi_pt, num_eig_vect ); + eta_pt += num_eig_vect; phi_pt += num_eig_vect; + mmvp_PRECISION( eta_pt, block_pt, phi_pt, num_eig_vect ); + block_pt += block_step_size; eta_pt += num_eig_vect; phi_pt += num_eig_vect; + // D + pmvp_PRECISION( eta_pt, block_pt, phi_pt, num_eig_vect ); + eta_pt += num_eig_vect; phi_pt += num_eig_vect; + mmvp_PRECISION( eta_pt, block_pt, phi_pt, num_eig_vect ); + block_pt += block_step_size; eta_pt += num_eig_vect; phi_pt += num_eig_vect; + } + } else +#endif + while ( phi_pt < phi_end_pt ) { + // A + pmvp_PRECISION( eta_pt, block_pt, phi_pt, num_eig_vect ); + block_pt += block_step_size; eta_pt += num_eig_vect; phi_pt += num_eig_vect; + // D + pmvp_PRECISION( eta_pt, block_pt, phi_pt, num_eig_vect ); + block_pt += block_step_size; eta_pt += num_eig_vect; phi_pt += num_eig_vect; + } } - static inline void coarse_n_daggered_hopp_PRECISION( vector_PRECISION eta, vector_PRECISION phi, - config_PRECISION D, level_struct *l ) { + static inline void coarse_add_anti_block_diagonal_PRECISION( vector_PRECISION eta, vector_PRECISION phi, + config_PRECISION block, int length, level_struct *l ) { - int num_eig_vect = l->num_lattice_site_var/2, - num_eig_vect2 = SQUARE(l->num_lattice_site_var/2); - // U_mu(x) = [ A B , U_-mu(x+muhat) = [ A* -C* - // C D ] -B* D* ] - // storage order: A, C, B, D - // note: minus sign of D = self_coupling - hopping_term is added here - - // A* - mvh_PRECISION( eta, D, phi, num_eig_vect ); - // -C* - phi += num_eig_vect; - D += num_eig_vect2; - nmvh_PRECISION( eta, D, phi, num_eig_vect ); - // -B* - eta += num_eig_vect; - phi -= num_eig_vect; - D += num_eig_vect2; - nmvh_PRECISION( eta, D, phi, num_eig_vect ); - // D* - phi += num_eig_vect; - D += num_eig_vect2; - mvh_PRECISION( eta, D, phi, num_eig_vect ); + int num_eig_vect = l->num_parent_eig_vect, + block_step_size = (num_eig_vect * (num_eig_vect+1))/2; + config_PRECISION block_pt = block; + vector_PRECISION phi_pt=phi, eta_pt=eta, phi_end_pt=phi+length; + // U(x) = [ A 0 , A=-A*, D=-D* diag. 
excluded + // 0 D ] + // storage order: upper triangle of A, upper triangle of D, columnwise + // diagonal coupling +#ifdef HAVE_TM1p1 + if( g.n_flavours == 2 ) { + while ( phi_pt < phi_end_pt ) { + // A + pamvp_PRECISION( eta_pt, block_pt, phi_pt, num_eig_vect ); + eta_pt += num_eig_vect; phi_pt += num_eig_vect; + mamvp_PRECISION( eta_pt, block_pt, phi_pt, num_eig_vect ); + block_pt += block_step_size; eta_pt += num_eig_vect; phi_pt += num_eig_vect; + // D + pamvp_PRECISION( eta_pt, block_pt, phi_pt, num_eig_vect ); + eta_pt += num_eig_vect; phi_pt += num_eig_vect; + mamvp_PRECISION( eta_pt, block_pt, phi_pt, num_eig_vect ); + block_pt += block_step_size; eta_pt += num_eig_vect; phi_pt += num_eig_vect; + } + } else +#endif + while ( phi_pt < phi_end_pt ) { + // A + pamvp_PRECISION( eta_pt, block_pt, phi_pt, num_eig_vect ); + block_pt += block_step_size; eta_pt += num_eig_vect; phi_pt += num_eig_vect; + // D + pamvp_PRECISION( eta_pt, block_pt, phi_pt, num_eig_vect ); + block_pt += block_step_size; eta_pt += num_eig_vect; phi_pt += num_eig_vect; + } } + static inline void coarse_add_doublet_coupling_PRECISION( vector_PRECISION eta, vector_PRECISION phi, + config_PRECISION block, int length, level_struct *l ) { + +#ifdef HAVE_TM1p1 + int num_eig_vect = l->num_parent_eig_vect, + block_step_size = (num_eig_vect * (num_eig_vect+1))/2; + config_PRECISION block_pt = block; + vector_PRECISION phi_pt=phi, eta_pt=eta, phi_end_pt=phi+length; + // U(x) = [ 0 A , A=-A*, D=-D* diag. excluded + // D 0 ] + // storage order: upper triangle of A, upper triangle of D, columnwise + // diagonal coupling + + while ( phi_pt < phi_end_pt ) { + // A + pamvp_PRECISION( eta_pt, block_pt, phi_pt+num_eig_vect, num_eig_vect ); + pamvp_PRECISION( eta_pt+num_eig_vect, block_pt, phi_pt, num_eig_vect ); + block_pt += block_step_size; eta_pt += 2*num_eig_vect; phi_pt += 2*num_eig_vect; + // D + pamvp_PRECISION( eta_pt, block_pt, phi_pt+num_eig_vect, num_eig_vect ); + pamvp_PRECISION( eta_pt+num_eig_vect, block_pt, phi_pt, num_eig_vect ); + block_pt += block_step_size; eta_pt += 2*num_eig_vect; phi_pt += 2*num_eig_vect; + } +#else + warning0("coarse_add_doublet_coupling_PRECISION called without HAVE_TM1p1 defined.\n"); + return; +#endif +} + static inline void coarse_spinwise_hopp_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, vector_PRECISION phi, config_PRECISION D, level_struct *l ) { - int num_eig_vect = l->num_lattice_site_var/2, - num_eig_vect2 = SQUARE(l->num_lattice_site_var/2); + int num_eig_vect = l->num_parent_eig_vect, + num_eig_vect2 = SQUARE(l->num_parent_eig_vect); // U_mu(x) = [ A B , U_-mu(x+muhat) = [ A* -C* // C D ] -B* D* ] // storage order: A, C, B, D @@ -281,8 +388,8 @@ static inline void coarse_spinwise_daggered_hopp_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, vector_PRECISION phi, config_PRECISION D, level_struct *l ) { - int num_eig_vect = l->num_lattice_site_var/2, - num_eig_vect2 = SQUARE(l->num_lattice_site_var/2); + int num_eig_vect = l->num_parent_eig_vect, + num_eig_vect2 = SQUARE(l->num_parent_eig_vect); // U_mu(x) = [ A B , U_-mu(x+muhat) = [ A* -C* // C D ] -B* D* ] // storage order: A, C, B, D @@ -309,8 +416,8 @@ static inline void coarse_spinwise_n_hopp_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, vector_PRECISION phi, config_PRECISION D, level_struct *l ) { - int num_eig_vect = l->num_lattice_site_var/2, - num_eig_vect2 = SQUARE(l->num_lattice_site_var/2); + int num_eig_vect = l->num_parent_eig_vect, + num_eig_vect2 = SQUARE(l->num_parent_eig_vect); // 
U_mu(x) = [ A B , U_-mu(x+muhat) = [ A* -C* // C D ] -B* D* ] // storage order: A, C, B, D @@ -336,8 +443,8 @@ static inline void coarse_spinwise_n_daggered_hopp_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, vector_PRECISION phi, config_PRECISION D, level_struct *l ) { - int num_eig_vect = l->num_lattice_site_var/2, - num_eig_vect2 = SQUARE(l->num_lattice_site_var/2); + int num_eig_vect = l->num_parent_eig_vect, + num_eig_vect2 = SQUARE(l->num_parent_eig_vect); // U_mu(x) = [ A B , U_-mu(x+muhat) = [ A* -C* // C D ] -B* D* ] // storage order: A, C, B, D diff --git a/src/data_generic.c b/src/data_generic.c deleted file mode 100644 index 950c814..0000000 --- a/src/data_generic.c +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder. - * - * This file is part of the DDalphaAMG solver library. - * - * The DDalphaAMG solver library is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * The DDalphaAMG solver library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * - * You should have received a copy of the GNU General Public License - * along with the DDalphaAMG solver library. If not, see http://www.gnu.org/licenses/. - * - */ - -#include "main.h" - -// vector storage for PRECISION precision -void vector_PRECISION_define( vector_PRECISION phi, complex_PRECISION value, int start, int end, level_struct *l ) { - - int thread = omp_get_thread_num(); - if(thread == 0 && start != end) - PROF_PRECISION_START( _SET ); - if ( phi != NULL ) { - int i; - for ( i=start; ischwarz_vector_size = 2*l->vector_size - l->inner_vector_size; } +void data_layout_n_flavours( int nf, level_struct *l, struct Thread *threading ) { + + ASSERT(nf>0); + ASSERT(l->depth == 0); + +#ifdef HAVE_TM1p1 + ASSERT(nf<=2); + + if( g.n_flavours == nf ) + return; + else + g.n_flavours = nf; + + START_LOCKED_MASTER(threading) + struct level_struct *l_tmp = l; + + while(1) { + if(l_tmp->depth == 0) + l_tmp->num_lattice_site_var = nf * 12; + else + l_tmp->num_lattice_site_var = nf * 2 * l_tmp->num_parent_eig_vect; + + l_tmp->inner_vector_size = l_tmp->num_inner_lattice_sites * l_tmp->num_lattice_site_var; + + l_tmp->vector_size = l_tmp->num_lattice_sites * l_tmp->num_lattice_site_var; + l_tmp->schwarz_vector_size = 2*l_tmp->vector_size - l_tmp->inner_vector_size; + + if(l_tmp->depth == 0) { + g.p.v_end = l_tmp->inner_vector_size; + g.p_MP.sp.v_end = l_tmp->inner_vector_size; + g.p_MP.dp.v_end = l_tmp->inner_vector_size; + } + + if ( g.mixed_precision ) { + l_tmp->s_float.block_vector_size = l_tmp->s_float.num_block_sites*l_tmp->num_lattice_site_var; + l_tmp->p_float.v_end = l_tmp->inner_vector_size; + l_tmp->sp_float.v_end = l_tmp->inner_vector_size; + l_tmp->dummy_p_float.v_end = l_tmp->inner_vector_size; + if ( (g.method >= 4 && g.odd_even) || (!l_tmp->idle && l_tmp->level == 0 && g.odd_even) ) { + if ( l_tmp->level == 0 ) + l_tmp->p_float.v_end = l_tmp->oe_op_float.num_even_sites*l_tmp->num_lattice_site_var; + else + l_tmp->sp_float.v_end = l_tmp->oe_op_float.num_even_sites*l_tmp->num_lattice_site_var; + } + + } else { + l_tmp->s_double.block_vector_size = 
l_tmp->s_double.num_block_sites*l_tmp->num_lattice_site_var; + l_tmp->p_double.v_end = l_tmp->inner_vector_size; + l_tmp->sp_double.v_end = l_tmp->inner_vector_size; + l_tmp->dummy_p_double.v_end = l_tmp->inner_vector_size; + if ( (g.method >= 4 && g.odd_even) || (!l_tmp->idle && l_tmp->level == 0 && g.odd_even) ) { + if ( l_tmp->level == 0 ) + l_tmp->p_double.v_end = l_tmp->oe_op_double.num_even_sites*l_tmp->num_lattice_site_var; + else + l_tmp->sp_double.v_end = l_tmp->oe_op_double.num_even_sites*l_tmp->num_lattice_site_var; + } + } + + if ( l->level == 0 || l_tmp->next_level == NULL ) + break; + + l_tmp = l_tmp->next_level; + } + + update_threading( no_threading, l); + END_LOCKED_MASTER(threading) + + update_threading( threading, l); +#else + ASSERT(nf==1); +#endif + +} void define_eot( int *eot, int *N, level_struct *l ) { diff --git a/src/data_layout.h b/src/data_layout.h index 566fe51..d9c3e7a 100644 --- a/src/data_layout.h +++ b/src/data_layout.h @@ -23,6 +23,7 @@ #define DATA_LAYOUT_HEADER void data_layout_init( level_struct *l ); + void data_layout_n_flavours( int n, level_struct *l, struct Thread *threading ); void define_eot( int *eot, int *N, level_struct *l ); void define_eo_bt( int **bt, int *eot, int *n_ebs, int *n_obs, int *n_bs, int *N, level_struct *l ); void define_nt_bt_tt( int *nt, int *backward_nt, int **bt, int *tt, int *it, int *dt, level_struct *l ); diff --git a/src/dirac.c b/src/dirac.c index 2afb74a..fec73d5 100644 --- a/src/dirac.c +++ b/src/dirac.c @@ -23,35 +23,37 @@ void compute_clover_term ( SU3_storage U, level_struct *l ) { int i, j, t, z, y, x, mu, nu; + operator_double_struct *op = &(g.op_double); -#ifdef HAVE_TM - - l->tm_shift = g.tm_mu; - l->tm_even_shift = g.tm_mu_even_shift; - l->tm_odd_shift = g.tm_mu_odd_shift; - - vector_double_define( g.op_double.odd_proj, _COMPLEX_double_ZERO, 0, l->inner_vector_size, l ); - vector_double_define( g.op_double.tm_term, I*(l->tm_shift + l->tm_even_shift), - 0, l->inner_vector_size, l ); + op->m0 = g.m0; for ( mu=0; mu<4; mu++ ) - g.op_double.oe_offset += (l->local_lattice[mu]*(g.my_coords[mu]/l->comm_offset[mu]))%2; - g.op_double.oe_offset = g.op_double.oe_offset%2; - + op->oe_offset += (l->local_lattice[mu]*(g.my_coords[mu]/l->comm_offset[mu]))%2; + op->oe_offset = op->oe_offset%2; + for ( i=0,t=0; tlocal_lattice[T]; t++ ) for ( z=0; zlocal_lattice[Z]; z++ ) for ( y=0; ylocal_lattice[Y]; y++ ) - for ( x=0; xlocal_lattice[X]; x++ ){ - if((t+z+y+x+g.op_double.oe_offset)%2) //odd - for ( j=0; j<12; j++, i++){ - g.op_double.odd_proj[i]=1; - g.op_double.tm_term[i]+=I*(l->tm_odd_shift - l->tm_even_shift); - } - else - i+=12; - } - - gamma5_double( g.op_double.tm_term, g.op_double.tm_term, l, no_threading); + for ( x=0; xlocal_lattice[X]; x++ ){ + if((t+z+y+x+op->oe_offset)%2) { //odd + FOR12(op->odd_proj[i] = 1; i++;); + } else { + FOR12(op->odd_proj[i] = _COMPLEX_double_ZERO; i++;); + } + } + +#ifdef HAVE_TM + if ( g.mu + g.mu_even_shift == 0 && g.mu + g.mu_odd_shift == 0 ) + vector_double_define_zero( op->tm_term, 0, l->inner_vector_size, l, no_threading ); + else + tm_term_double_setup( g.mu, g.mu_even_shift, g.mu_odd_shift, op, l, no_threading ); +#endif + +#ifdef HAVE_TM1p1 + if ( g.epsbar == 0 && g.epsbar_ig5_even_shift == 0 && g.epsbar_ig5_odd_shift == 0 ) + vector_double_define_zero( op->epsbar_term, 0, l->inner_vector_size, l, no_threading ); + else + epsbar_term_double_setup( g.epsbar, g.epsbar_ig5_even_shift, g.epsbar_ig5_odd_shift, op, l, no_threading ); #endif // generate clover term @@ -63,7 +65,7 @@ 
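
The odd projector filled in above is purely diagonal: each lattice site is classified by the parity of its coordinates, shifted by op->oe_offset (which accounts for where the local sub-lattice starts in the global lattice), and all twelve spin-colour components of an odd site are set to one while even sites stay at zero. A minimal stand-alone sketch of that classification follows; the lattice extents, the oe_offset value and the helper name site_is_odd are illustrative only and not part of the library API.

  #include <stdio.h>

  #define NCOMP 12   /* spin-colour components per site, cf. FOR12 above */

  /* parity test playing the role of (t+z+y+x+op->oe_offset)%2 above */
  static int site_is_odd( int t, int z, int y, int x, int oe_offset ) {
    return ( t + z + y + x + oe_offset ) % 2;
  }

  int main( void ) {
    int L[4] = { 2, 2, 2, 2 };   /* illustrative local lattice */
    int oe_offset = 1;           /* e.g. a process whose local origin sits on an odd global site */
    double odd_proj[2*2*2*2*NCOMP];
    int i = 0, n_odd = 0;
    for ( int t=0; t<L[0]; t++ )
      for ( int z=0; z<L[1]; z++ )
        for ( int y=0; y<L[2]; y++ )
          for ( int x=0; x<L[3]; x++ ) {
            int odd = site_is_odd( t, z, y, x, oe_offset );
            for ( int c=0; c<NCOMP; c++, i++ )
              odd_proj[i] = odd ? 1.0 : 0.0;
            n_odd += odd;
          }
    printf( "odd sites: %d of %d\n", n_odd, L[0]*L[1]*L[2]*L[3] );
    return 0;
  }
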
void compute_clover_term ( SU3_storage U, level_struct *l ) { j = 42*l->num_inner_lattice_sites; for ( i=0; iclover[i] = 0; i = 0; for ( t=1; tlocal_lattice[T]+1; t++ ) for ( z=1; zlocal_lattice[Z]+1; z++ ) @@ -71,12 +73,12 @@ void compute_clover_term ( SU3_storage U, level_struct *l ) { for ( x=1; xlocal_lattice[X]+1; x++ ) { // diagonal including the shift for ( j=0; j<12; j++) - g.op_double.clover[42*i+j] = 4+l->dirac_shift; + op->clover[42*i+j] = 4+op->m0; for ( mu=0; mu<4; mu++ ) for ( nu=mu+1; nu<4; nu++ ) { Qdiff( Qstore, mu, nu, t, z, y, x, U ); - set_clover( Qstore, mu, nu, i, g.op_double.clover ); + set_clover( Qstore, mu, nu, i, op->clover ); } i++; } @@ -84,7 +86,7 @@ void compute_clover_term ( SU3_storage U, level_struct *l ) { mat_free( &Qstore, 3 ); spin_free( 4, 4 ); } else { - vector_double_define( g.op_double.clover, 4+l->dirac_shift, 0, l->inner_vector_size, l ); + vector_double_define_real( op->clover, 4+op->m0, 0, l->inner_vector_size, l, no_threading ); } } @@ -108,7 +110,7 @@ void dirac_setup( config_double hopp, level_struct *l ) { if ( g.print > 0 ) printf0("%s\n", CLIFFORD_BASIS ); if ( g.bc == _ANTIPERIODIC ) printf0("antiperiodic in time"); else if ( g.bc == _TWISTED ) printf0("twisted (%.2f, %.2f, %.2f, %.2f)", g.twisted_bc[0], - g.twisted_bc[1], g.twisted_bc[2], g.twisted_bc[3]); + g.twisted_bc[1], g.twisted_bc[2], g.twisted_bc[3]); else printf0("periodic in time"); printf0(" boundary conditions\n"); @@ -124,36 +126,37 @@ void dirac_setup( config_double hopp, level_struct *l ) { if (t 0 ) printf0("Configuration stored...\n"); - + compute_clover_term( U, l ); // calculate the plaquette @@ -650,143 +653,151 @@ void define_odd_even_table( level_struct *l ) { } -void scale_clover( operator_double_struct *op, double scale_even, double scale_odd, level_struct *l ) { - - int i, j, n = l->num_inner_lattice_sites, *odd_even_table = g.odd_even_table; - double factors[2]; - config_double clover=op->clover, clover_pt; - - factors[_EVEN] = scale_even; factors[_ODD] = scale_odd; - - if ( g.csw != 0.0 ) { - for ( i=0; idepth == 0) { + m0_update_double( m0, &(g.op_double), l, threading ); + m0_update_float( m0, &(g.op_float), l, threading ); } else { - for ( i=0; iop_float), l, threading ); + else + m0_update_double( m0, &(l->op_double), l, threading ); } + + if ( g.mixed_precision ) { + m0_update_float( m0, &(l->oe_op_float), l, threading ); + m0_update_float( m0, &(l->s_float.op), l, threading ); + } else { + m0_update_double( m0, &(l->oe_op_double), l, threading ); + m0_update_double( m0, &(l->s_double.op), l, threading ); + } + + START_LOCKED_MASTER(threading) + if(g.print>0) printf0("depth: %d, kappa updated to %f \n", (l->depth), 0.5/(m0 + 4.)); + END_LOCKED_MASTER(threading) + + if ( g.interpolation && l->level > 0 && l->next_level != NULL ) + m0_update(m0, l->next_level, threading); } -void shift_update( complex_double shift, level_struct *l, struct Thread *threading ) { - ASSERT(l->depth == 0); - shift_update_double( &(g.op_double), shift, l, threading ); - shift_update_float( &(g.op_float), shift, l, threading ); - shift_update_double( &(l->s_double.op), shift, l, threading ); - shift_update_float( &(l->s_float.op), shift, l, threading ); +void tm_term_update( double mu, level_struct *l, struct Thread *threading ) { - if ( g.mixed_precision ) - operator_updates_float( l, threading ); - else - operator_updates_double( l, threading ); +#ifdef HAVE_TM + double factor = g.mu_factor[l->depth]; + double even_shift = g.mu_even_shift, odd_shift = g.mu_odd_shift; + + if 
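// --- editor's note (illustrative sketch, not part of the patch) -------------
// m0_update() adds the difference new_m0 - old_m0 to every stored self-coupling
// diagonal (4 + m0 on the fine grid) and recurses into l->next_level; the
// message it prints uses the usual hopping-parameter relation
// kappa = 1 / ( 2*(m0 + 4) ). A toy conversion:
#include <stdio.h>

static double kappa_from_m0( double m0 )    { return 0.5 / ( m0 + 4.0 ); }
static double m0_from_kappa( double kappa ) { return 0.5 / kappa - 4.0; }

int main( void ) {
  double m0 = -0.5;                                    // toy value
  printf( "m0=%g -> kappa=%g\n", m0, kappa_from_m0( m0 ) );
  printf( "kappa=%g -> m0=%g\n", kappa_from_m0( m0 ), m0_from_kappa( kappa_from_m0( m0 ) ) );
  return 0;
}
// ----------------------------------------------------------------------------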
(l->depth == 0) { // we don't use the multiplicative factor here + tm_term_double_setup( mu, even_shift, odd_shift, &(g.op_double), l, threading ); + tm_term_float_setup( mu, even_shift, odd_shift, &(g.op_float), l, threading ); + } else { + if ( g.mixed_precision ) + tm_term_float_setup( factor*mu, factor*even_shift, factor*odd_shift, &(l->op_float), l, threading ); + else + tm_term_double_setup( factor*mu, factor*even_shift, factor*odd_shift, &(l->op_double), l, threading ); + } + + if ( g.mixed_precision ) { + tm_term_float_setup( factor*mu, factor*even_shift, factor*odd_shift, &(l->oe_op_float), l, threading ); + tm_term_float_setup( factor*mu, factor*even_shift, factor*odd_shift, &(l->s_float.op), l, threading ); + } else { + tm_term_double_setup( factor*mu, factor*even_shift, factor*odd_shift, &(l->oe_op_double), l, threading ); + tm_term_double_setup( factor*mu, factor*even_shift, factor*odd_shift, &(l->s_double.op), l, threading ); + } - START_LOCKED_MASTER(threading) - l->dirac_shift = shift; - l->real_shift = creal(shift); - END_LOCKED_MASTER(threading) + START_MASTER(threading) + if(g.print>0) { + if( g.mu_even_shift == g.mu_odd_shift ) + printf0("depth: %d, mu updated to %f \n", (l->depth), factor*(mu+even_shift)); + else + printf0("depth: %d, mu updated to %f on even sites and %f on odd sites \n", l->depth, factor*(mu+even_shift), + factor*(mu+odd_shift)); + } + END_MASTER(threading) -#ifdef DEBUG - test_routine( l, threading ); + if ( g.interpolation && l->level > 0 && l->next_level != NULL ) + tm_term_update( mu, l->next_level, threading ); #endif } -void optimized_shift_update( complex_double mass_shift, level_struct *l, struct Thread *threading ) { - - ASSERT(l->depth==0); - - if ( mass_shift != l->dirac_shift ) { - shift_update_double( &(g.op_double), mass_shift, l, threading ); - shift_update_float( &(g.op_float), mass_shift, l, threading ); - if(l->s_double.op.clover != NULL) - shift_update_double( &(l->s_double.op), mass_shift, l, threading ); - if ( l->s_float.op.clover != NULL ) - shift_update_float( &(l->s_float.op), mass_shift, l, threading ); +void epsbar_term_update( level_struct *l, struct Thread *threading ) { - START_LOCKED_MASTER(threading) - l->dirac_shift = mass_shift; - l->real_shift = creal(mass_shift); - END_LOCKED_MASTER(threading) - } +#ifdef HAVE_TM1p1 + double factor = g.epsbar_factor[l->depth]; + double epsbar = g.epsbar; + double even_shift = g.epsbar_ig5_even_shift, odd_shift = g.epsbar_ig5_odd_shift; + + if (l->depth == 0) { + epsbar_term_double_setup( epsbar, even_shift, odd_shift, &(g.op_double), l, threading ); + epsbar_term_float_setup( epsbar, even_shift, odd_shift, &(g.op_float), l, threading ); + } else { + if ( g.mixed_precision ) + epsbar_term_float_setup( factor*epsbar, factor*even_shift, factor*odd_shift, &(l->op_float), l, threading ); + else + epsbar_term_double_setup( factor*epsbar, factor*even_shift, factor*odd_shift, &(l->op_double), l, threading ); + } -#ifdef HAVE_TM - if ( l->tm_shift != g.tm_mu || l->tm_even_shift != g.tm_mu_even_shift || - l->tm_odd_shift != g.tm_mu_odd_shift ) { + if ( g.mixed_precision ) { + epsbar_term_float_setup( factor*epsbar, factor*even_shift, factor*odd_shift, &(l->oe_op_float),l, threading ); + epsbar_term_float_setup( factor*epsbar, factor*even_shift, factor*odd_shift, &(l->s_float.op), l, threading ); + } else { + epsbar_term_double_setup( factor*epsbar, factor*even_shift, factor*odd_shift, &(l->oe_op_double),l, threading ); + epsbar_term_double_setup( factor*epsbar, factor*even_shift, 
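// --- editor's note (illustrative sketch, not part of the patch) -------------
// tm_term_update() rescales the twisted-mass parameter per level: coarse grids
// multiply mu and the even/odd shifts by g.mu_factor[l->depth], while the
// fine-grid global operator uses them as given (g.mu_factor[0] is usually
// chosen as 1, so both views agree there; that default is an assumption of
// this note). A toy print of the effective values reported by the routine:
#include <stdio.h>

static void print_effective_mu( double mu, double even_shift, double odd_shift,
                                const double *mu_factor, int n_levels ) {
  for ( int depth = 0; depth < n_levels; depth++ ) {
    double f = mu_factor[depth];
    printf( "depth %d: mu_even=%f mu_odd=%f\n",
            depth, f * ( mu + even_shift ), f * ( mu + odd_shift ) );
  }
}

int main( void ) {
  double factors[3] = { 1.0, 2.0, 4.0 };   // toy per-depth factors
  print_effective_mu( 0.01, 0.0, 0.0, factors, 3 );
  return 0;
}
// ----------------------------------------------------------------------------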
factor*odd_shift, &(l->s_double.op), l, threading ); + } - START_MASTER(threading) - if( g.tm_mu_even_shift == g.tm_mu_odd_shift ) - printf0("depth: %d, updating mu to %f \n", (l->depth), cimag(g.tm_mu+g.tm_mu_even_shift)); + START_MASTER(threading) + if(g.print>0) { + if( even_shift == odd_shift ) + printf0("depth: %d, epsbar term updated to %f + ig5 %f \n", l->depth, factor*epsbar, factor*even_shift); else - printf0("depth: %d, updating mu to %f on even sites and %f on odd sites \n", l->depth, cimag(g.tm_mu+g.tm_mu_even_shift), cimag(g.tm_mu+g.tm_mu_even_shift)); - - l->tm_shift = g.tm_mu; - l->tm_even_shift = g.tm_mu_even_shift; - l->tm_odd_shift = g.tm_mu_odd_shift; - END_LOCKED_MASTER(threading) - - tm_term_double_setup( g.op_double.tm_term, g.op_double.odd_proj, l, threading ); - tm_term_float_setup( g.op_float.tm_term, g.op_float.odd_proj, l, threading ); - - if(l->s_double.op.tm_term != NULL) - tm_term_double_setup( l->s_double.op.tm_term, l->s_double.op.odd_proj, l, threading ); - - if ( l->s_float.op.tm_term != NULL ) - tm_term_float_setup( l->s_float.op.tm_term, l->s_float.op.odd_proj, l, threading ); + printf0("depth: %d, epsbar term updated to %f + ig5 %f on even sites and + ig5 %f on odd sites \n", l->depth, + factor*epsbar, factor*even_shift, factor*odd_shift); } -#endif - - START_LOCKED_MASTER(threading) - if(l->s_double.op.clover != NULL) { -#ifdef OPTIMIZED_SELF_COUPLING_double - if ( g.csw != 0 ) { - double *clover_vectorized_pt = l->s_double.op.clover_vectorized; - config_double clover_pt = l->s_double.op.clover; - config_double tm_term_pt = l->s_double.op.tm_term; - for ( int i=0; inum_inner_lattice_sites; i++ ) { - sse_set_clover_double( clover_vectorized_pt, clover_pt ); - sse_add_diagonal_clover_double( clover_vectorized_pt, tm_term_pt ); - clover_pt += 42; - tm_term_pt += 12; - clover_vectorized_pt += 144; - } - } -#endif - if ( g.odd_even ) - schwarz_double_oddeven_setup( &(l->s_double.op), l ); - } + END_MASTER(threading) - if ( l->s_float.op.clover != NULL ) { -#ifdef OPTIMIZED_SELF_COUPLING_float - if ( g.csw != 0 ) { - config_double clover_pt = g.op_double.clover; - config_double tm_term_pt = g.op_double.tm_term; - for ( int i=0; inum_inner_lattice_sites; i++ ) { - //we have to reorder the term, while in OPTIMIZED_SELF_COUPLING_double we use already reordered terms - float *clover_vectorized_pt = l->s_float.op.clover_vectorized + 144*l->s_float.op.translation_table[i]; - sse_set_clover_float( clover_vectorized_pt, clover_pt ); - sse_add_diagonal_clover_float( clover_vectorized_pt, tm_term_pt ); - clover_pt += 42; - tm_term_pt += 12; - } - } + if ( g.interpolation && l->level > 0 && l->next_level != NULL ) + epsbar_term_update( l->next_level, threading ); #endif - if ( g.odd_even ) - schwarz_float_oddeven_setup( &(l->s_float.op), l ); - } - END_LOCKED_MASTER(threading) +} - if ( g.mixed_precision ) - optimized_shift_update_float( mass_shift, l->next_level, threading ); - else - optimized_shift_update_double( mass_shift, l->next_level, threading ); +void finalize_operator_update( level_struct *l, struct Thread *threading ) { + + if (l->depth == 0) { + START_LOCKED_MASTER(threading) + if(l->s_double.op.clover != NULL) { + operator_double_set_self_couplings( &(l->s_double.op), l ); + if ( g.odd_even ) + schwarz_double_oddeven_setup( &(l->s_double), l ); + } + + if ( l->s_float.op.clover != NULL ) { + operator_float_set_self_couplings( &(l->s_float.op), l ); + if ( g.odd_even ) + schwarz_float_oddeven_setup( &(l->s_float), l ); + } + END_LOCKED_MASTER(threading) + 
} else { + SYNC_CORES(threading) + if ( g.mixed_precision ) { + if ( !l->idle && g.odd_even && ((g.method >= 4 && l->level > 0) || l->level == 0) ) + coarse_oddeven_float_set_self_couplings( l, threading ); + else + coarse_operator_float_set_self_couplings( &(l->s_float.op), l, threading ); + } else { + if ( !l->idle && g.odd_even && ((g.method >= 4 && l->level > 0) || l->level == 0) ) + coarse_oddeven_double_set_self_couplings( l, threading ); + else + coarse_operator_double_set_self_couplings( &(l->s_double.op), l, threading ); + } + } + if ( g.interpolation && l->level > 0 ) + finalize_operator_update( l->next_level, threading ); + #ifdef DEBUG - if ( l->depth == 0 ) + if (l->depth == 0) test_routine( l, threading ); #endif + } diff --git a/src/dirac.h b/src/dirac.h index f239c51..3b65c89 100644 --- a/src/dirac.h +++ b/src/dirac.h @@ -27,7 +27,7 @@ struct Thread; typedef complex_double ******SU3_storage; void compute_clover_term ( SU3_storage U, level_struct *l ); -void dirac_setup( config_double hopp, level_struct *l ); + void dirac_setup( config_double hopp, level_struct *l ); void SU3_storage_alloc( SU3_storage *U, level_struct *l ); void SU3_storage_free( SU3_storage *U, level_struct *l ); @@ -47,8 +47,9 @@ void dirac_setup( config_double hopp, level_struct *l ); void set_clover( complex_double *q_store, int mu, int nu, int index, config_double clover ); void define_odd_even_table( level_struct *l ); - void scale_clover( operator_double_struct *op, double scale_even, double scale_odd, level_struct *l ); - void shift_update( complex_double shift, level_struct *l, struct Thread *threading ); - void optimized_shift_update( complex_double mass_shift, level_struct *l, struct Thread *threading ); + void m0_update( double m0, level_struct *l, struct Thread *threading ); + void tm_term_update( double mu, level_struct *l, struct Thread *threading ); + void epsbar_term_update( level_struct *l, struct Thread *threading ); + void finalize_operator_update( level_struct *l, struct Thread *threading ); #endif diff --git a/src/dirac_generic.c b/src/dirac_generic.c index 02c5e93..41de506 100644 --- a/src/dirac_generic.c +++ b/src/dirac_generic.c @@ -21,26 +21,149 @@ #include "main.h" -void clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, config_PRECISION clover, int length, +void clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operator_PRECISION_struct *op, int start, int end, level_struct *l, struct Thread *threading ) { - - vector_PRECISION eta_end = eta + length; + + int nv = l->num_lattice_site_var; + vector_PRECISION lphi = phi+start, leta = eta+start; + vector_PRECISION leta_end = eta+end; + +#ifdef PROFILING + START_MASTER(threading) + PROF_PRECISION_START( _SC ); + END_MASTER(threading) +#endif + +#ifdef HAVE_TM + config_PRECISION tm_term = op->tm_term+(start/nv)*12; +#endif + if ( g.csw == 0.0 ) { - while ( eta < eta_end ) { - FOR12( *eta = (*phi)*(*clover); eta++; phi++; clover++; ) + + config_PRECISION clover = op->clover+(start/nv)*12; +#ifdef HAVE_TM1p1 + if( g.n_flavours == 2 ) { +#ifdef HAVE_TM + if ( g.mu + g.mu_odd_shift != 0.0 || g.mu + g.mu_even_shift != 0.0 ) + while ( leta < leta_end ) { + FOR6( *leta = (*lphi)*((*clover)+(*tm_term)); leta++; lphi++; clover++; tm_term++; ); + clover -= 6; + tm_term -= 6; + FOR6( *leta = (*lphi)*((*clover)-(*tm_term)); leta++; lphi++; clover++; tm_term++; ); + FOR6( *leta = (*lphi)*((*clover)+(*tm_term)); leta++; lphi++; clover++; tm_term++; ); + clover -= 6; + tm_term -= 6; + FOR6( *leta = 
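// --- editor's note (illustrative sketch, not part of the patch) -------------
// For csw == 0 the self coupling is a pure 12-component diagonal per site, and
// the twisted-mass term is simply added to it; in the flavour doublet the sign
// of the TM term flips between the two flavours (the tau_3 structure of the
// Nf=2 twisted-mass operator). A plain-C version of that diagonal, assuming
// the doublet site layout used by the patch (flavour 1 / flavour 2 interleaved
// per spin half):
#include <complex.h>

static void doublet_diagonal( double complex *eta, const double complex *phi,
                              const double complex *clover, const double complex *tm ) {
  for ( int h = 0; h < 2; h++ )        // the two spin halves
    for ( int f = 0; f < 2; f++ )      // the two flavours
      for ( int c = 0; c < 6; c++ ) {
        int k = 12*h + 6*f + c;        // position in the 24-component doublet site
        int d =  6*h + c;              // position in the 12-entry diagonals
        eta[k] = phi[k] * ( clover[d] + ( f ? -tm[d] : tm[d] ) );
      }
}
// With csw != 0 the 6x6 clover blocks are applied by (doublet_)site_clover_*
// instead, and the +/- tm term is added to the result afterwards.
// ----------------------------------------------------------------------------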
(*lphi)*((*clover)-(*tm_term)); leta++; lphi++; clover++; tm_term++; ); + } + else +#endif + while ( leta < leta_end ) { + FOR6( *leta = (*lphi)*(*clover); leta++; lphi++; clover++; ); + clover -= 6; + FOR12( *leta = (*lphi)*(*clover); leta++; lphi++; clover++; ); + clover -= 6; + FOR6( *leta = (*lphi)*(*clover); leta++; lphi++; clover++; ); + } + } else { +#endif +#ifdef HAVE_TM + if ( g.mu + g.mu_odd_shift != 0.0 || g.mu + g.mu_even_shift != 0.0 ) { + while ( leta < leta_end ) + FOR12( *leta = (*lphi)*((*clover)+(*tm_term)); leta++; lphi++; clover++; tm_term++; ); + } else +#endif + while ( leta < leta_end ) + FOR12( *leta = (*lphi)*(*clover); leta++; lphi++; clover++; ); +#ifdef HAVE_TM1p1 } +#endif + } else { - START_MASTER(threading) - PROF_PRECISION_START( _SC ); - END_MASTER(threading) - while ( eta < eta_end ) { - site_clover_PRECISION( eta, phi, clover ); - eta+=12; phi+=12; clover+=42; + +#ifndef OPTIMIZED_SELF_COUPLING_PRECISION + + config_PRECISION clover = op->clover+(start/nv)*42; +#ifdef HAVE_TM1p1 + if( g.n_flavours == 2 ) { +#ifdef HAVE_TM + if ( g.mu + g.mu_odd_shift != 0.0 || g.mu + g.mu_even_shift != 0.0 ) + while ( leta < leta_end ) { + doublet_site_clover_PRECISION( leta, lphi, clover ); + clover+=42; + FOR6( *leta += (*lphi)*(*tm_term); leta++; lphi++; tm_term++; ); + tm_term -= 6; + FOR6( *leta -=(*lphi)*(*tm_term); leta++; lphi++; tm_term++; ); + FOR6( *leta += (*lphi)*(*tm_term); leta++; lphi++; tm_term++; ); + tm_term -= 6; + FOR6( *leta -= (*lphi)*(*tm_term); leta++; lphi++; tm_term++; ); + } + else +#endif + while ( leta < leta_end ) { + doublet_site_clover_PRECISION( leta, lphi, clover ); + leta+=24; lphi+=24; + clover+=42; + } + } else { +#endif +#ifdef HAVE_TM + if ( g.mu + g.mu_odd_shift != 0.0 || g.mu + g.mu_even_shift != 0.0 ) + while ( leta < leta_end ) { + site_clover_PRECISION( leta, lphi, clover ); + FOR12( *leta += (*lphi)*(*tm_term); leta++; lphi++; tm_term++; ); + clover+=42; + } + else +#endif + while ( leta < leta_end ) { + site_clover_PRECISION( leta, lphi, clover ); + leta+=12; lphi+=12; + clover+=42; + } +#ifdef HAVE_TM1p1 } - START_MASTER(threading) - PROF_PRECISION_STOP( _SC, 1 ); - END_MASTER(threading) +#endif + +#else + +#ifdef HAVE_TM1p1 + PRECISION *clover = ( g.n_flavours == 2 ) ? 
op->clover_doublet_vectorized : op->clover_vectorized; +#else + PRECISION *clover = op->clover_vectorized; +#endif + clover += start*12; + while ( leta < leta_end ) { // tm_term included in the clover vectorized + site_clover_vectorized_PRECISION( (PRECISION*) leta, (PRECISION*) lphi, clover ); + leta += 3*SIMD_LENGTH_PRECISION; lphi += 3*SIMD_LENGTH_PRECISION; + clover += 12*3*SIMD_LENGTH_PRECISION; + } + +#endif + } + +#ifdef HAVE_TM1p1 + config_PRECISION eps_term = op->epsbar_term+(start/nv)*12; + lphi = phi+start, leta = eta+start; + if ( g.n_flavours == 2 && + ( g.epsbar != 0 || g.epsbar_ig5_odd_shift != 0 || g.epsbar_ig5_odd_shift != 0 ) ) + while ( leta < leta_end ) { + lphi += 6; + FOR6( *leta += (*lphi)*(*eps_term); leta++; lphi++; eps_term++; ) + lphi -= 12; + eps_term -= 6; + FOR6( *leta += (*lphi)*(*eps_term); leta++; lphi++; eps_term++; ) + lphi += 6; + } +#endif + + +#ifdef PROFILING + START_MASTER(threading) + PROF_PRECISION_STOP( _SC, 1 ); + END_MASTER(threading) +#endif + } static void spin0and1_clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, config_PRECISION clover, level_struct *l ) { @@ -78,112 +201,315 @@ static void spin2and3_clover_PRECISION( vector_PRECISION eta, vector_PRECISION p } } -#if !defined(OPTIMIZED_NEIGHBOR_COUPLING_PRECISION) && !defined(OPTIMIZED_SELF_COUPLING_PRECISION) void block_d_plus_clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, int start, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ) { START_UNTHREADED_FUNCTION(threading) - int i, n = s->num_block_sites, *length = s->dir_length, **index = s->index, *neighbor = s->op.neighbor_table; + int n = s->num_block_sites, *length = s->dir_length, **index = s->index, *neighbor = s->op.neighbor_table, nv = l->num_lattice_site_var; vector_PRECISION lphi = phi+start, leta = eta+start; - config_PRECISION clover = (g.csw==0.0)?s->op.clover+start:s->op.clover+(start/12)*42; - int j, k, *ind; - complex_PRECISION buf1[25]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, *buf2=buf1+6, *buf3=buf2+6, *buf4=buf3+6; - config_PRECISION D_pt; - config_PRECISION D = s->op.D + (start/12)*36; - + // clover term - clover_PRECISION( leta, lphi, clover, 12*n, l, no_threading ); -#ifdef HAVE_TM - config_PRECISION tm_term = s->op.tm_term+start; - if (g.tm_mu + g.tm_mu_odd_shift != 0.0 || g.tm_mu + g.tm_mu_even_shift != 0.0 ) - add_diagonal_PRECISION( leta, lphi, tm_term, 12*n ); -#endif - - // inner block couplings - ind = index[T]; // T direction - for ( i=0; iop), start, start+nv*n, l, no_threading ); + +#ifdef OPTIMIZED_NEIGHBOR_COUPLING_float // block operator vectorized just in the float environment + PRECISION *Dplus = s->op.D_vectorized + (start/nv)*96; + PRECISION *Dminus = s->op.D_transformed_vectorized + (start/nv)*96; + for ( int mu=0; mu<4; mu++ ) { + block_oddeven_plus_coupling_PRECISION( (PRECISION*)leta, Dplus, (PRECISION*)lphi, mu, 0, length[mu], index[mu], neighbor ); + block_oddeven_minus_coupling_PRECISION( (PRECISION*)leta, Dminus, (PRECISION*)lphi, mu, 0, length[mu], index[mu], neighbor ); } - ind = index[X]; // X direction - for ( i=0; iop.D + (start/nv)*36; +#ifdef HAVE_TM1p1 + if( g.n_flavours == 2 ) { + complex_PRECISION buf1[50]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, *buf2=buf1+12, *buf3=buf2+12, *buf4=buf3+12; + // inner block couplings + ind = index[T]; // T direction + for ( i=0; inum_inner_lattice_sites, *neighbor = op->neighbor_table, start, end; + int n = 
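// --- editor's note (illustrative sketch, not part of the patch) -------------
// The epsbar term of the TM1p1 doublet is flavour off-diagonal: within each
// spin half, flavour 1 of eta picks up eps * (flavour 2 of phi) and vice
// versa, which is what the pointer arithmetic at the end of clover_PRECISION()
// does. Plain-C version for one 24-component doublet site:
#include <complex.h>

static void add_epsbar_coupling( double complex *eta, const double complex *phi,
                                 const double complex *eps /* 12 entries per site */ ) {
  for ( int h = 0; h < 2; h++ )        // spin halves
    for ( int c = 0; c < 6; c++ ) {
      int f1 = 12*h + c, f2 = 12*h + 6 + c, d = 6*h + c;
      eta[f1] += eps[d] * phi[f2];     // flavour 1 <- flavour 2
      eta[f2] += eps[d] * phi[f1];     // flavour 2 <- flavour 1
    }
}
// ----------------------------------------------------------------------------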
l->num_inner_lattice_sites, *neighbor = op->neighbor_table, start, end, nv = l->num_lattice_site_var; +#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION + complex_PRECISION *prn[4] = { op->prnT, op->prnZ, op->prnY, op->prnX }; + complex_PRECISION *prp[4] = { op->prpT, op->prpZ, op->prpY, op->prpX }; +#else int i, j, *nb_pt; - complex_PRECISION pbuf[6]; vector_PRECISION phi_pt, eta_pt, end_pt; config_PRECISION D_pt; - compute_core_start_end(0, 12*n, &start, &end, l, threading ); - vector_PRECISION lphi = phi+start, leta = eta+start; - config_PRECISION clover = (g.csw==0.0)?op->clover+start:op->clover+(start/12)*42; - +#endif + + compute_core_start_end(0, nv*n, &start, &end, l, threading ); + SYNC_MASTER_TO_ALL(threading) - // clover term - clover_PRECISION( leta, lphi, clover, end-start, l, threading ); -#ifdef HAVE_TM - if (g.tm_mu + g.tm_mu_odd_shift != 0.0 || g.tm_mu + g.tm_mu_even_shift != 0.0 ) - add_diagonal_PRECISION( leta, lphi, op->tm_term+start, end-start ); -#endif - + clover_PRECISION(eta, phi, op, start, end, l, threading ); + START_MASTER(threading) PROF_PRECISION_START( _NC ); END_MASTER(threading) + +#ifdef HAVE_TM1p1 + if( g.n_flavours == 2 ) { +#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION + dprp_PRECISION( prn, phi, start, end ); +#else + complex_PRECISION pbuf[12]; + for ( i=start/2, phi_pt=phi+start; iprnT+i, phi_pt ); + dprp_Z_PRECISION( op->prnZ+i, phi_pt ); + dprp_Y_PRECISION( op->prnY+i, phi_pt ); + dprp_X_PRECISION( op->prnX+i, phi_pt ); + } +#endif + // start communication in negative direction + START_LOCKED_MASTER(threading) + ghost_sendrecv_PRECISION( op->prnT, T, -1, &(op->c), _FULL_SYSTEM, l ); + ghost_sendrecv_PRECISION( op->prnZ, Z, -1, &(op->c), _FULL_SYSTEM, l ); + ghost_sendrecv_PRECISION( op->prnY, Y, -1, &(op->c), _FULL_SYSTEM, l ); + ghost_sendrecv_PRECISION( op->prnX, X, -1, &(op->c), _FULL_SYSTEM, l ); + END_LOCKED_MASTER(threading) +#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION + dprn_su3_PRECISION( prp, phi, op, neighbor, start, end ); +#else + // project plus dir and multiply with U dagger + for ( phi_pt=phi+start, end_pt=phi+end, D_pt = op->D+((start/nv)*36), nb_pt=neighbor+((start/nv)*4); phi_ptprpT+j, D_pt, pbuf ); + mvmh_PRECISION( op->prpT+j+3, D_pt, pbuf+3 ); + mvmh_PRECISION( op->prpT+j+6, D_pt, pbuf+6 ); + mvmh_PRECISION( op->prpT+j+9, D_pt, pbuf+9 ); D_pt += 9; + // Z dir + j = 12*(*nb_pt); nb_pt++; + dprn_Z_PRECISION( pbuf, phi_pt ); + mvmh_PRECISION( op->prpZ+j, D_pt, pbuf ); + mvmh_PRECISION( op->prpZ+j+3, D_pt, pbuf+3 ); + mvmh_PRECISION( op->prpZ+j+6, D_pt, pbuf+6 ); + mvmh_PRECISION( op->prpZ+j+9, D_pt, pbuf+9 ); D_pt += 9; + // Y dir + j = 12*(*nb_pt); nb_pt++; + dprn_Y_PRECISION( pbuf, phi_pt ); + mvmh_PRECISION( op->prpY+j, D_pt, pbuf ); + mvmh_PRECISION( op->prpY+j+3, D_pt, pbuf+3 ); + mvmh_PRECISION( op->prpY+j+6, D_pt, pbuf+6 ); + mvmh_PRECISION( op->prpY+j+9, D_pt, pbuf+9 ); D_pt += 9; + // X dir + j = 12*(*nb_pt); nb_pt++; + dprn_X_PRECISION( pbuf, phi_pt ); + mvmh_PRECISION( op->prpX+j, D_pt, pbuf ); + mvmh_PRECISION( op->prpX+j+3, D_pt, pbuf+3 ); + mvmh_PRECISION( op->prpX+j+6, D_pt, pbuf+6 ); + mvmh_PRECISION( op->prpX+j+9, D_pt, pbuf+9 ); D_pt += 9; + } +#endif + + // start communication in positive direction + START_LOCKED_MASTER(threading) + ghost_sendrecv_PRECISION( op->prpT, T, +1, &(op->c), _FULL_SYSTEM, l ); + ghost_sendrecv_PRECISION( op->prpZ, Z, +1, &(op->c), _FULL_SYSTEM, l ); + ghost_sendrecv_PRECISION( op->prpY, Y, +1, &(op->c), _FULL_SYSTEM, l ); + ghost_sendrecv_PRECISION( op->prpX, X, +1, &(op->c), _FULL_SYSTEM, 
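// --- editor's note (illustrative sketch, not part of the patch) -------------
// d_plus_clover_PRECISION() hides communication behind computation: the
// projected boundary data is posted with ghost_sendrecv_*() early and only
// collected with ghost_wait_*() right before it is consumed. A minimal MPI
// sketch of that post/overlap/wait pattern (this is NOT the library's
// communication code; function and variable names are illustrative):
#include <mpi.h>

static void exchange_halo( double *send, double *recv, int count,
                           int rank_up, int rank_down, MPI_Comm comm ) {
  MPI_Request req[2];
  MPI_Irecv( recv, count, MPI_DOUBLE, rank_down, 0, comm, &req[0] );  // post receive
  MPI_Isend( send, count, MPI_DOUBLE, rank_up,   0, comm, &req[1] );  // post send
  /* ... interior work overlaps with the transfer here ... */
  MPI_Waitall( 2, req, MPI_STATUSES_IGNORE );                         // wait before use
}
// Because only the (1 +/- gamma_mu)-projected half spinors are exchanged, each
// direction sends 6 complex numbers per boundary site (12 for the doublet).
// ----------------------------------------------------------------------------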
l ); + // wait for communication in negative direction + ghost_wait_PRECISION( op->prnT, T, -1, &(op->c), _FULL_SYSTEM, l ); + ghost_wait_PRECISION( op->prnZ, Z, -1, &(op->c), _FULL_SYSTEM, l ); + ghost_wait_PRECISION( op->prnY, Y, -1, &(op->c), _FULL_SYSTEM, l ); + ghost_wait_PRECISION( op->prnX, X, -1, &(op->c), _FULL_SYSTEM, l ); + END_LOCKED_MASTER(threading) + +#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION + su3_dpbp_PRECISION( eta, prn, op, neighbor, start, end ); +#else + // multiply with U and lift up minus dir + for ( eta_pt=eta+start, end_pt=eta+end, D_pt = op->D+(start/nv)*36, nb_pt=neighbor+(start/nv)*4; eta_ptprnT+j ); + mvm_PRECISION( pbuf+3, D_pt, op->prnT+j+3 ); + mvm_PRECISION( pbuf+6, D_pt, op->prnT+j+6 ); + mvm_PRECISION( pbuf+9, D_pt, op->prnT+j+9 ); + dpbp_su3_T_PRECISION( pbuf, eta_pt ); D_pt += 9; + // Z dir + j = 12*(*nb_pt); nb_pt++; + mvm_PRECISION( pbuf, D_pt, op->prnZ+j ); + mvm_PRECISION( pbuf+3, D_pt, op->prnZ+j+3 ); + mvm_PRECISION( pbuf+6, D_pt, op->prnZ+j+6 ); + mvm_PRECISION( pbuf+9, D_pt, op->prnZ+j+9 ); + dpbp_su3_Z_PRECISION( pbuf, eta_pt ); D_pt += 9; + // Y dir + j = 12*(*nb_pt); nb_pt++; + mvm_PRECISION( pbuf, D_pt, op->prnY+j ); + mvm_PRECISION( pbuf+3, D_pt, op->prnY+j+3 ); + mvm_PRECISION( pbuf+6, D_pt, op->prnY+j+6 ); + mvm_PRECISION( pbuf+9, D_pt, op->prnY+j+9 ); + dpbp_su3_Y_PRECISION( pbuf, eta_pt ); D_pt += 9; + // X dir + j = 12*(*nb_pt); nb_pt++; + mvm_PRECISION( pbuf, D_pt, op->prnX+j ); + mvm_PRECISION( pbuf+3, D_pt, op->prnX+j+3 ); + mvm_PRECISION( pbuf+6, D_pt, op->prnX+j+6 ); + mvm_PRECISION( pbuf+9, D_pt, op->prnX+j+9 ); + dpbp_su3_X_PRECISION( pbuf, eta_pt ); D_pt += 9; + } +#endif + + // wait for communication in positive direction + START_LOCKED_MASTER(threading) + ghost_wait_PRECISION( op->prpT, T, +1, &(op->c), _FULL_SYSTEM, l ); + ghost_wait_PRECISION( op->prpZ, Z, +1, &(op->c), _FULL_SYSTEM, l ); + ghost_wait_PRECISION( op->prpY, Y, +1, &(op->c), _FULL_SYSTEM, l ); + ghost_wait_PRECISION( op->prpX, X, +1, &(op->c), _FULL_SYSTEM, l ); + END_LOCKED_MASTER(threading) + + // lift up plus dir +#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION + dpbn_PRECISION( eta, prp, start, end ); +#else + for ( i=start/2, eta_pt=eta+start; iprpT+i, eta_pt ); + dpbn_su3_Z_PRECISION( op->prpZ+i, eta_pt ); + dpbn_su3_Y_PRECISION( op->prpY+i, eta_pt ); + dpbn_su3_X_PRECISION( op->prpX+i, eta_pt ); + } +#endif + } else { +#endif + +#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION + prp_PRECISION( prn, phi, start, end ); +#else + complex_PRECISION pbuf[6]; for ( i=start/2, phi_pt=phi+start; iprnT+i, phi_pt ); prp_Z_PRECISION( op->prnZ+i, phi_pt ); prp_Y_PRECISION( op->prnY+i, phi_pt ); prp_X_PRECISION( op->prnX+i, phi_pt ); } +#endif // start communication in negative direction START_LOCKED_MASTER(threading) ghost_sendrecv_PRECISION( op->prnT, T, -1, &(op->c), _FULL_SYSTEM, l ); @@ -193,6 +519,9 @@ void d_plus_clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operat END_LOCKED_MASTER(threading) // project plus dir and multiply with U dagger +#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION + prn_su3_PRECISION( prp, phi, op, neighbor, start, end ); +#else for ( phi_pt=phi+start, end_pt=phi+end, D_pt = op->D+(start*3), nb_pt=neighbor+((start/12)*4); phi_ptprpX+j, D_pt, pbuf ); mvmh_PRECISION( op->prpX+j+3, D_pt, pbuf+3 ); D_pt += 9; } +#endif // start communication in positive direction START_LOCKED_MASTER(threading) @@ -230,6 +560,9 @@ void d_plus_clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operat END_LOCKED_MASTER(threading) // 
multiply with U and lift up minus dir +#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION + su3_pbp_PRECISION( eta, prn, op, neighbor, start, end ); +#else for ( eta_pt=eta+start, end_pt=eta+end, D_pt = op->D+start*3, nb_pt=neighbor+(start/12)*4; eta_ptprnX+j+3 ); pbp_su3_X_PRECISION( pbuf, eta_pt ); D_pt += 9; } +#endif // wait for communication in positive direction START_LOCKED_MASTER(threading) @@ -262,40 +596,394 @@ void d_plus_clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operat END_LOCKED_MASTER(threading) // lift up plus dir +#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION + pbn_PRECISION( eta, prp, start, end ); +#else for ( i=start/2, eta_pt=eta+start; iprpT+i, eta_pt ); pbn_su3_Z_PRECISION( op->prpZ+i, eta_pt ); pbn_su3_Y_PRECISION( op->prpY+i, eta_pt ); pbn_su3_X_PRECISION( op->prpX+i, eta_pt ); } - +#endif +#ifdef HAVE_TM1p1 + } +#endif + START_MASTER(threading) PROF_PRECISION_STOP( _NC, 1 ); END_MASTER(threading) SYNC_MASTER_TO_ALL(threading) } -#endif - - -void d_plus_clover_dagger_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { - gamma5_PRECISION( l->vbuf_PRECISION[6], phi, l, threading ); - d_plus_clover_PRECISION( l->vbuf_PRECISION[7], l->vbuf_PRECISION[6], op, l, threading ); - gamma5_PRECISION( eta, l->vbuf_PRECISION[7], l, threading ); -} void gamma5_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ) { + ASSERT(l->depth == 0); + vector_PRECISION eta_end = eta + threading->end_index[l->depth]; eta += threading->start_index[l->depth]; phi += threading->start_index[l->depth]; +#ifdef HAVE_TM1p1 + if ( g.n_flavours == 2 ) { + while ( eta < eta_end ) { + FOR12( *eta = -(*phi); phi++; eta++; ) + FOR12( *eta = (*phi); phi++; eta++; ) + } + } else +#endif while ( eta < eta_end ) { FOR6( *eta = -(*phi); phi++; eta++; ) FOR6( *eta = (*phi); phi++; eta++; ) } } +void tau1_gamma5_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ) { + + ASSERT(l->depth == 0); + +#ifdef HAVE_TM1p1 + if ( g.n_flavours == 2 ) { + vector_PRECISION eta_end = eta + threading->end_index[l->depth]; + complex_PRECISION b[6]; + eta += threading->start_index[l->depth]; + phi += threading->start_index[l->depth]; + while ( eta < eta_end ) { + int i = 0; + FOR6( b[i] = (*phi); phi++; i++; ); + FOR6( *eta = -(*phi); phi++; eta++; ); + i = 0; + FOR6( *eta = - b[i] ; eta++; i++; ); + i = 0; + FOR6( b[i] = (*phi); phi++; i++; ); + FOR6( *eta = (*phi); phi++; eta++; ); + i = 0; + FOR6( *eta = b[i] ; eta++; i++; ); + } + } else +#endif + { + START_MASTER(threading) + warning0("tau1_gamma5_PRECISION called with g.n_flavours != 2\n"); + END_MASTER(threading) + gamma5_PRECISION( eta, phi, l, threading ); + } +} + +void set_even_to_zero_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ) { + + ASSERT(l->depth == 0); + + int i = threading->start_site[l->depth]; + vector_PRECISION eta_end = eta + threading->end_index[l->depth]; + eta += threading->start_index[l->depth]; + phi += threading->start_index[l->depth]; + +#ifdef HAVE_TM1p1 + if ( g.n_flavours == 2 ) + while ( eta < eta_end ) { + if(g.odd_even_table[i]==_ODD) { + FOR24( *eta = (*phi); phi++; eta++; ); + } + else if(g.odd_even_table[i]==_EVEN) { + FOR24( *eta = _COMPLEX_PRECISION_ZERO; phi++; eta++; ); + } + i++; + } + else +#endif + while ( eta < eta_end ) { + if(g.odd_even_table[i]==_ODD) { + FOR12( *eta = (*phi); phi++; eta++; ); + } 
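// --- editor's note (illustrative sketch, not part of the patch) -------------
// gamma5 acts per site by flipping the sign of the first spin pair; for the
// doublet, tau1 x gamma5 additionally swaps the two flavours. Plain-C versions
// for one site, assuming the doublet layout [f1 spins 0-1 | f2 spins 0-1 |
// f1 spins 2-3 | f2 spins 2-3]:
#include <complex.h>

static void apply_gamma5_site( double complex *eta, const double complex *phi ) {
  for ( int c = 0; c < 6; c++ ) eta[c]     = -phi[c];      // spins 0-1
  for ( int c = 0; c < 6; c++ ) eta[6 + c] =  phi[6 + c];  // spins 2-3
}

static void apply_tau1_gamma5_site( double complex *eta, const double complex *phi ) {
  for ( int c = 0; c < 6; c++ ) { eta[c]      = -phi[6 + c];  eta[6 + c]  = -phi[c];      }
  for ( int c = 0; c < 6; c++ ) { eta[12 + c] =  phi[18 + c]; eta[18 + c] =  phi[12 + c]; }
}
// The *_set_even_to_zero / *_set_odd_to_zero variants do the same while
// writing zeros on the sites of the other parity, using g.odd_even_table.
// ----------------------------------------------------------------------------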
+ else if(g.odd_even_table[i]==_EVEN) { + FOR12( *eta = _COMPLEX_PRECISION_ZERO; phi++; eta++; ); + } + i++; + } +} + +void gamma5_set_even_to_zero_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ) { + + ASSERT(l->depth == 0); + + int i = threading->start_site[l->depth]; + vector_PRECISION eta_end = eta + threading->end_index[l->depth]; + eta += threading->start_index[l->depth]; + phi += threading->start_index[l->depth]; + +#ifdef HAVE_TM1p1 + if ( g.n_flavours == 2 ) + while ( eta < eta_end ) { + if(g.odd_even_table[i]==_ODD){ + FOR12( *eta = -(*phi); phi++; eta++; ); + FOR12( *eta = (*phi); phi++; eta++; ); + } + else if(g.odd_even_table[i]==_EVEN){ + FOR24( *eta = _COMPLEX_PRECISION_ZERO; phi++; eta++; ); + } + i++; + } + else +#endif + while ( eta < eta_end ) { + if(g.odd_even_table[i]==_ODD){ + FOR6( *eta = -(*phi); phi++; eta++; ); + FOR6( *eta = (*phi); phi++; eta++; ); + } + else if(g.odd_even_table[i]==_EVEN){ + FOR12( *eta = _COMPLEX_PRECISION_ZERO; phi++; eta++; ); + } + i++; + } +} + +void tau1_gamma5_set_even_to_zero_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ) { + + ASSERT(l->depth == 0); + +#ifdef HAVE_TM1p1 + if ( g.n_flavours == 2 ) { + int i = threading->start_site[l->depth]; + vector_PRECISION eta_end = eta + threading->end_index[l->depth]; + eta += threading->start_index[l->depth]; + phi += threading->start_index[l->depth]; + + complex_PRECISION b[6]; + while ( eta < eta_end ) { + if(g.odd_even_table[i]==_ODD){ + int i = 0; + FOR6( b[i] = (*phi); phi++; i++; ); + FOR6( *eta = -(*phi); phi++; eta++; ); + i = 0; + FOR6( *eta = - b[i] ; eta++; i++; ); + i = 0; + FOR6( b[i] = (*phi); phi++; i++; ); + FOR6( *eta = (*phi); phi++; eta++; ); + i = 0; + FOR6( *eta = b[i] ; eta++; i++; ); + } else if(g.odd_even_table[i]==_EVEN){ + FOR24( *eta = _COMPLEX_PRECISION_ZERO; phi++; eta++; ); + } + i++; + } + } else +#endif + { + START_MASTER(threading) + warning0("tau1_gamma5_set_even_to_zero_PRECISION called with g.n_flavours != 2\n"); + END_MASTER(threading) + gamma5_set_even_to_zero_PRECISION( eta, phi, l, threading ); + } +} + +void set_odd_to_zero_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ) { + + int i = threading->start_site[l->depth]; + vector_PRECISION eta_end = eta + threading->end_index[l->depth]; + eta += threading->start_index[l->depth]; + phi += threading->start_index[l->depth]; + +#ifdef HAVE_TM1p1 + if ( g.n_flavours == 2 ) + while ( eta < eta_end ) { + if(g.odd_even_table[i]==_EVEN){ + FOR24( *eta = (*phi); phi++; eta++; ); + } + else if(g.odd_even_table[i]==_ODD){ + FOR24( *eta = 0; phi++; eta++; ); + } + i++; + } + else +#endif + while ( eta < eta_end ) { + if(g.odd_even_table[i]==_EVEN) { + FOR12( *eta = (*phi); phi++; eta++; ); + } + else if(g.odd_even_table[i]==_ODD) { + FOR12( *eta = 0; phi++; eta++; ); + } + i++; + } +} + +void gamma5_set_odd_to_zero_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ) { + + int i = threading->start_site[l->depth]; + vector_PRECISION eta_end = eta + threading->end_index[l->depth]; + eta += threading->start_index[l->depth]; + phi += threading->start_index[l->depth]; + +#ifdef HAVE_TM1p1 + if ( g.n_flavours == 2 ) + while ( eta < eta_end ) { + if(g.odd_even_table[i]==_EVEN){ + FOR12( *eta = -(*phi); phi++; eta++; ); + FOR12( *eta = (*phi); phi++; eta++; ); + } + else if(g.odd_even_table[i]==_ODD){ + FOR24( *eta = 0; phi++; eta++; 
); + } + i++; + } + else +#endif + while ( eta < eta_end ) { + if(g.odd_even_table[i]==_EVEN){ + FOR6( *eta = -(*phi); phi++; eta++; ); + FOR6( *eta = (*phi); phi++; eta++; ); + } + else if(g.odd_even_table[i]==_ODD){ + FOR12( *eta = 0; phi++; eta++; ); + } + i++; + } +} + +void tau1_gamma5_set_odd_to_zero_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ) { + + ASSERT(l->depth == 0); + +#ifdef HAVE_TM1p1 + if ( g.n_flavours == 2 ) { + int i = threading->start_site[l->depth]; + vector_PRECISION eta_end = eta + threading->end_index[l->depth]; + eta += threading->start_index[l->depth]; + phi += threading->start_index[l->depth]; + + complex_PRECISION b[6]; + while ( eta < eta_end ) { + if(g.odd_even_table[i]==_EVEN){ + int i = 0; + FOR6( b[i] = (*phi); phi++; i++; ); + FOR6( *eta = -(*phi); phi++; eta++; ); + i = 0; + FOR6( *eta = - b[i] ; eta++; i++; ); + i = 0; + FOR6( b[i] = (*phi); phi++; i++; ); + FOR6( *eta = (*phi); phi++; eta++; ); + i = 0; + FOR6( *eta = b[i] ; eta++; i++; ); + } else if(g.odd_even_table[i]==_ODD){ + FOR24( *eta = _COMPLEX_PRECISION_ZERO; phi++; eta++; ); + } + i++; + } + } else +#endif + { + START_MASTER(threading) + warning0("tau1_gamma5_set_odd_to_zero_PRECISION called with g.n_flavours != 2\n"); + END_MASTER(threading) + gamma5_set_odd_to_zero_PRECISION( eta, phi, l, threading ); + } +} + +void scale_even_odd_PRECISION( vector_PRECISION eta, vector_PRECISION phi, complex_double even, complex_double odd, + level_struct *l, struct Thread *threading ) { + + int i = threading->start_site[l->depth]; + vector_PRECISION eta_end = eta + threading->end_index[l->depth]; + eta += threading->start_index[l->depth]; + phi += threading->start_index[l->depth]; + +#ifdef HAVE_TM1p1 + if ( g.n_flavours == 2 ) + while ( eta < eta_end ) { + if(g.odd_even_table[i]==_EVEN){ + FOR24( *eta = even*(*phi); phi++; eta++; ); + } + else if(g.odd_even_table[i]==_ODD){ + FOR24( *eta = odd*(*phi); phi++; eta++; ); + } + i++; + } + else +#endif + while ( eta < eta_end ) { + if(g.odd_even_table[i]==_EVEN) { + FOR12( *eta = even*(*phi); phi++; eta++; ); + } + else if(g.odd_even_table[i]==_ODD) { + FOR12( *eta = odd*(*phi); phi++; eta++; ); + } + i++; + } +} + + +void two_flavours_to_serial_PRECISION( vector_PRECISION flav1, vector_PRECISION flav2, vector_PRECISION serial, level_struct *l, struct Thread *threading ) { + +#ifdef HAVE_TM1p1 + + /* + * Order: spin0and1 of flav1 + * spin0and1 of flav2 + * spin2and3 of flav1 + * spin2and3 of flav2 + */ + vector_PRECISION serial_end; + + if( g.n_flavours == 2 ) { + serial_end = serial + threading->end_index[l->depth]; + serial += threading->start_index[l->depth]; + flav1 += threading->start_index[l->depth]/2; + flav2 += threading->start_index[l->depth]/2; + } + else { + serial_end = serial + threading->end_index[l->depth]*2; + serial += threading->start_index[l->depth]*2; + flav1 += threading->start_index[l->depth]; + flav2 += threading->start_index[l->depth]; + } + + while ( serial < serial_end ) { + FOR6( *serial = (*flav1); serial++; flav1++; ) + FOR6( *serial = (*flav2); serial++; flav2++; ) + FOR6( *serial = (*flav1); serial++; flav1++; ) + FOR6( *serial = (*flav2); serial++; flav2++; ) + } +#else + START_MASTER(threading) + warning0("two_flavours_to_serial_PRECISION called without HAVE_TM1p1 defined\n"); + END_MASTER(threading) +#endif + +} + +void serial_to_two_flavours_PRECISION( vector_PRECISION flav1, vector_PRECISION flav2, vector_PRECISION serial, level_struct *l, struct Thread *threading ) { + 
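// --- editor's note (illustrative sketch, not part of the patch) -------------
// two_flavours_to_serial_PRECISION() interleaves the two single-flavour
// vectors site by site in the order stated in its comment: spins 0-1 of
// flavour 1, spins 0-1 of flavour 2, spins 2-3 of flavour 1, spins 2-3 of
// flavour 2. Per-site version in plain C (serial_to_two_flavours_* is the
// exact inverse):
#include <complex.h>

static void interleave_flavours_site( double complex *serial,
                                      const double complex *flav1,
                                      const double complex *flav2 ) {
  for ( int h = 0; h < 2; h++ )          // spin halves
    for ( int c = 0; c < 6; c++ ) {
      serial[12*h + c]     = flav1[6*h + c];
      serial[12*h + 6 + c] = flav2[6*h + c];
    }
}
// ----------------------------------------------------------------------------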
+#ifdef HAVE_TM1p1 + vector_PRECISION serial_end; + + if( g.n_flavours == 2 ) { + serial_end = serial + threading->end_index[l->depth]; + serial += threading->start_index[l->depth]; + flav1 += threading->start_index[l->depth]/2; + flav2 += threading->start_index[l->depth]/2; + } + else { + serial_end = serial + threading->end_index[l->depth]*2; + serial += threading->start_index[l->depth]*2; + flav1 += threading->start_index[l->depth]; + flav2 += threading->start_index[l->depth]; + } + + while ( serial < serial_end ) { + FOR6( *flav1 = (*serial); serial++; flav1++; ) + FOR6( *flav2 = (*serial); serial++; flav2++; ) + FOR6( *flav1 = (*serial); serial++; flav1++; ) + FOR6( *flav2 = (*serial); serial++; flav2++; ) + } +#else + START_MASTER(threading) + warning0("two_flavours_to_serial_PRECISION called without HAVE_TM1p1 defined\n"); + END_MASTER(threading) +#endif + +} void g5D_plus_clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { d_plus_clover_PRECISION( eta, phi, op, l, threading ); @@ -304,14 +992,105 @@ void g5D_plus_clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, oper SYNC_CORES(threading) } +void set_clover_vectorized_PRECISION( operator_PRECISION_struct *op, level_struct *l, Thread *threading ) { + +#define real_index( i, j ) ((i)/SIMD_LENGTH_PRECISION)*12*SIMD_LENGTH_PRECISION + SIMD_LENGTH_PRECISION*(j)*2 + (i)%SIMD_LENGTH_PRECISION +#define imag_index( i, j ) ((i)/SIMD_LENGTH_PRECISION)*12*SIMD_LENGTH_PRECISION + SIMD_LENGTH_PRECISION*((j)*2+1) + (i)%SIMD_LENGTH_PRECISION + + int clover_size = 42; + config_PRECISION clover_pt = op->clover; + PRECISION *clover_v_pt = op->clover_vectorized; +#ifdef HAVE_TM + config_PRECISION tm_term_pt = op->tm_term; +#endif +#ifdef HAVE_TM1p1 + PRECISION *clover_doublet_v_pt = op->clover_doublet_vectorized; +#endif + int start, end; + // ASSUMPTION: SIMD_LENGTH_PRECISION power of 2. + compute_core_start_end_custom( 0, l->num_inner_lattice_sites, &start, &end, l, threading, (SIMD_LENGTH_PRECISION<4) ? 1:(SIMD_LENGTH_PRECISION/4)); + + int index; + PRECISION sign = 0.0; + for ( int i=start*12; i 12 ) index = index % 12; + if ( index == j || index-6 == j ) { + // diagonal entry i+k,i+k + index = n*clover_size + index; + sign = 1.0; + } else if ( index < 6 ) { + // first 6-by-6 matrix + if ( j > index ) { + // upper triangle + index = n*clover_size + 12 + ( 30 - (5-index)*(6-index) )/2 + (j-(index+1)); + sign = 1.0; + } else { + // lower triangle, j < i+k + index = n*clover_size + 12 + ( 30 - (5-(j))*(6-(j)) )/2 + (index-(j+1)); + sign = -1.0; + } + } else { + // i+k >= 6 + // second 6-by-6 matrix + index = index - 6; + if ( j > index ) { + // upper triangle + index = n*clover_size + 12 + 15 + ( 30 - (5-index)*(6-index) )/2 + (j-(index+1)); + sign = 1.0; + } else { + // j < i+k-6 + // lower triangle + index = n*clover_size + 12 + 15 + ( 30 - (5-(j))*(6-(j)) )/2 + (index-(j+1)); + sign = -1.0; + } + } + PRECISION c_re = creal_PRECISION( clover_pt[index] ); + PRECISION c_im = sign*cimag_PRECISION( clover_pt[index] ); +#ifdef HAVE_TM + if ((i+k)%6 == j) { + // add tm_term to diagonal + c_re += creal_PRECISION( tm_term_pt[i+k] ); + c_im += cimag_PRECISION( tm_term_pt[i+k] ); + } +#endif + clover_v_pt[ real_index(i+k,j) ] = c_re; + clover_v_pt[ imag_index(i+k,j) ] = c_im; +#ifdef HAVE_TM1p1 + int d = ( (i+k)%12 < 6 ) ? 
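// --- editor's note (illustrative sketch, not part of the patch) -------------
// With csw != 0 the self coupling stores 42 complex numbers per site: the 12
// diagonal entries followed by the 15 upper-triangle entries of each of the
// two Hermitian 6x6 spin blocks; the lower triangle is recovered as the
// complex conjugate, which is the sign handling visible in
// set_clover_vectorized_PRECISION(). Packed index of entry (r,c) with c > r
// inside one block:
static int packed_upper_index( int r, int c ) {   // 0 <= r < c < 6
  return ( 30 - ( 5 - r ) * ( 6 - r ) ) / 2 + ( c - r - 1 );
}
// per-site offsets: 12 + packed_upper_index(r,c)        first 6x6 block
//                   12 + 15 + packed_upper_index(r,c)   second 6x6 block
// ----------------------------------------------------------------------------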
0:6; + clover_doublet_v_pt[ real_index(12*n+i+k+d,j) ] = c_re; + clover_doublet_v_pt[ imag_index(12*n+i+k+d,j) ] = c_im; +#ifdef HAVE_TM + if ((i+k)%6 == j) { + // change sign to tm_term on diagonal + c_re -= 2*creal_PRECISION( tm_term_pt[i+k] ); + c_im -= 2*cimag_PRECISION( tm_term_pt[i+k] ); + } +#endif + clover_doublet_v_pt[ real_index(12*n+i+k+d+6,j) ] = c_re; + clover_doublet_v_pt[ imag_index(12*n+i+k+d+6,j) ] = c_im; +#endif + } + } + } + +#undef real_index +#undef imag_index + +} + void diagonal_aggregate_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, vector_PRECISION phi, config_PRECISION diag, level_struct *l ) { vector_PRECISION eta_end = eta1 + l->inner_vector_size; while ( eta1 < eta_end ) { - FOR6( *eta1 = (*phi)*(*diag); *eta2 = _COMPLEX_PRECISION_ZERO; eta1++; eta2++; phi++; diag++; ) - FOR6( *eta2 = (*phi)*(*diag); *eta1 = _COMPLEX_PRECISION_ZERO; eta1++; eta2++; phi++; diag++; ) + FOR6( *eta1 = (*phi)*(*diag); *eta2 = _COMPLEX_PRECISION_ZERO; eta1++; eta2++; phi++; diag++; ); + FOR6( *eta2 = (*phi)*(*diag); *eta1 = _COMPLEX_PRECISION_ZERO; eta1++; eta2++; phi++; diag++; ); } } @@ -405,7 +1184,6 @@ void d_plus_clover_aggregate_PRECISION( vector_PRECISION eta1, vector_PRECISION } } - void d_neighbor_aggregate_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, vector_PRECISION phi, const int mu, schwarz_PRECISION_struct *s, level_struct *l ) { int i, length, index1, index2, *index_dir, *neighbor; @@ -468,7 +1246,7 @@ void d_neighbor_aggregate_PRECISION( vector_PRECISION eta1, vector_PRECISION eta } } } - + void apply_twisted_bc_to_vector_PRECISION( vector_PRECISION eta, vector_PRECISION phi, double *theta, level_struct *l) { int t, z, y, x, i; int *gl=l->global_lattice, sl[4]; @@ -482,12 +1260,17 @@ void apply_twisted_bc_to_vector_PRECISION( vector_PRECISION eta, vector_PRECISIO for (z=0; zlocal_lattice[1]; z++) { phase[Z] = phase[T] + theta[Z]*((double)sl[Z]+z)/(double)gl[Z]; for (y=0; ylocal_lattice[2]; y++) { - phase[Y] = phase[Z] + theta[Y]*((double)sl[Y]+y)/(double)gl[Y]; + phase[Y] = phase[Z] + theta[Y]*((double)sl[Y]+y)/(double)gl[Y]; for (x=0; xlocal_lattice[3]; x++) { - phase[X] = phase[Y] + theta[X]*((double)sl[X]+x)/(double)gl[X]; - twisted_bc = exp(I*phase[X]); - FOR12( *eta = (*phi)*twisted_bc; phi++; eta++; ); - } + phase[X] = phase[Y] + theta[X]*((double)sl[X]+x)/(double)gl[X]; + twisted_bc = exp(I*phase[X]); +#ifdef HAVE_TM1p1 + if( g.n_flavours == 2 ) { + FOR24( *eta = (*phi)*twisted_bc; phi++; eta++; ); + } else +#endif + { FOR12( *eta = (*phi)*twisted_bc; phi++; eta++; ) } + } } } } @@ -497,20 +1280,12 @@ void operator_updates_PRECISION( level_struct *l, struct Thread *threading ) { if ( l->level > 0 ) { if ( !l->idle ) { -#ifdef INTERPOLATION_SETUP_LAYOUT_OPTIMIZED_PRECISION +#ifdef OPTIMIZED_INTERPOLATION_SETUP_PRECISION coarse_operator_PRECISION_setup_vectorized( l->is_PRECISION.operator, l, threading ); START_LOCKED_MASTER(threading) #else START_LOCKED_MASTER(threading) coarse_operator_PRECISION_setup( l->is_PRECISION.interpolation, l ); -#endif -#ifdef HAVE_TM - l->next_level->tm_shift = g.tm_mu*g.tm_mu_factor[l->next_level->depth]; - l->next_level->tm_even_shift = g.tm_mu_even_shift*g.tm_mu_factor[l->next_level->depth]; - l->next_level->tm_odd_shift = g.tm_mu_odd_shift*g.tm_mu_factor[l->next_level->depth]; - - if( g.tm_mu_factor[l->next_level->depth]!=g.tm_mu_factor[l->depth] ) - tm_term_PRECISION_setup( l->next_level->op_PRECISION.tm_term, l->next_level->op_PRECISION.odd_proj, l->next_level, no_threading ); #endif 
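// --- editor's note (illustrative sketch, not part of the patch) -------------
// apply_twisted_bc_to_vector_*() multiplies every component of a site by a
// phase built from the twist angles theta[mu] and the site's global
// coordinate x[mu] on a lattice of extent gl[mu]. Per-site phase in plain C:
#include <complex.h>
#include <math.h>

static double complex twisted_phase( const double theta[4], const int x[4], const int gl[4] ) {
  double phase = 0.0;
  for ( int mu = 0; mu < 4; mu++ )
    phase += theta[mu] * (double) x[mu] / (double) gl[mu];
  return cexp( I * phase );
}
// All 12 components of the site (24 for the doublet) are scaled by this factor.
// ----------------------------------------------------------------------------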
conf_PRECISION_gather( &(l->next_level->s_PRECISION.op), &(l->next_level->op_PRECISION), l->next_level ); END_LOCKED_MASTER(threading) @@ -519,13 +1294,13 @@ void operator_updates_PRECISION( level_struct *l, struct Thread *threading ) { schwarz_PRECISION_boundary_update( &(l->next_level->s_PRECISION), l->next_level ); END_LOCKED_MASTER(threading) if ( g.method >= 4 && g.odd_even ) { - coarse_oddeven_re_setup_PRECISION( &(l->next_level->s_PRECISION.op), _REORDER, l->next_level, threading ); + coarse_oddeven_setup_PRECISION( &(l->next_level->s_PRECISION.op), _REORDER, l->next_level, threading ); } else { coarse_operator_PRECISION_set_couplings( &(l->next_level->s_PRECISION.op), l->next_level, threading ); } } if ( !l->next_level->idle && l->next_level->level == 0 && g.odd_even ) { - coarse_oddeven_re_setup_PRECISION( &(l->next_level->s_PRECISION.op), _NO_REORDERING, l->next_level, threading ); + coarse_oddeven_setup_PRECISION( &(l->next_level->s_PRECISION.op), _NO_REORDERING, l->next_level, threading ); } else if ( !l->next_level->idle && l->next_level->level == 0 ) { coarse_operator_PRECISION_set_couplings( &(l->next_level->s_PRECISION.op), l->next_level, threading ); } @@ -535,7 +1310,7 @@ void operator_updates_PRECISION( level_struct *l, struct Thread *threading ) { } -void shift_update_PRECISION( operator_PRECISION_struct *op, complex_PRECISION shift, level_struct *l, struct Thread *threading ) { +void m0_update_PRECISION( PRECISION m0, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { // no hyperthreading in this function if(threading->thread != 0) @@ -543,181 +1318,308 @@ void shift_update_PRECISION( operator_PRECISION_struct *op, complex_PRECISION sh config_PRECISION clover = op->clover; - if ( clover != NULL ) { + if ( clover != NULL && op->m0 != m0 ) { int i, j; - complex_PRECISION old_shift = (complex_PRECISION) l->dirac_shift; - complex_PRECISION shift_diff = shift - old_shift; - - if ( l->depth == 0 ) { - int start = threading->start_site[l->depth]; - int n = threading->n_site[l->depth]; - clover += start*(g.csw?42:12); - for ( i=0; istart_site[l->depth]; - int n = threading->n_site[l->depth]; - int k = l->num_lattice_site_var/2; - int sc_size = (l->num_lattice_site_var/2)*(l->num_lattice_site_var+1); - clover += start*sc_size; - for ( i=0; i0 ) clover += j+1; - *clover += shift_diff; + complex_PRECISION m0_diff = m0 - op->m0; + + START_MASTER(threading) + op->m0 = m0; + END_MASTER(threading) + + if( m0_diff != 0 ) { + if ( l->depth == 0 ) { + int start = threading->start_site[l->depth]; + int n = threading->n_site[l->depth]; + clover += start*(g.csw?42:12); + for ( i=0; i0 ) clover += j+1; - *clover += shift_diff; + } else { + int start = threading->start_site[l->depth]; + int n = threading->n_site[l->depth]; + int k = l->num_parent_eig_vect; + int sc_size = (l->num_parent_eig_vect)*(l->num_parent_eig_vect*2+1); + clover += start*sc_size; + for ( i=0; i0 ) clover += j+1; + *clover += m0_diff; + } + clover ++; + for ( j=0; j0 ) clover += j+1; + *clover += m0_diff; + } + clover += 1 + SQUARE(k); } - clover += 1 + SQUARE(k); } } } } -void tm_term_PRECISION_setup( config_PRECISION tm_term, config_PRECISION odd_proj, level_struct *l, struct Thread *threading ) { +void tm_term_PRECISION_setup( PRECISION mu, PRECISION even, PRECISION odd, operator_PRECISION_struct *op, + level_struct *l, struct Thread *threading ) { #ifdef HAVE_TM if(threading->thread != 0) return; - complex_PRECISION shift = I*l->tm_shift; - complex_PRECISION even_shift = I*l->tm_even_shift; 
- complex_PRECISION odd_shift = I*l->tm_odd_shift; - + config_PRECISION tm_term = op->tm_term; if ( tm_term != NULL ) { + config_PRECISION odd_proj = op->odd_proj; + complex_PRECISION shift = I*mu; + complex_PRECISION even_shift = I*even; + complex_PRECISION odd_shift = I*odd; + + START_MASTER(threading) + op->mu = mu; + op->mu_even_shift = even; + op->mu_odd_shift = odd; + END_MASTER(threading) + int i, j; int start, end; compute_core_start_end(0, l->num_inner_lattice_sites, &start, &end, l, threading); int n = end-start; - complex_PRECISION tm_shift; if ( l->depth == 0 ) { + complex_PRECISION tm_shift; tm_term += start*12; odd_proj += start*12; for ( i=0; inum_lattice_site_var/2; - int tm_size = (l->num_lattice_site_var/2)*(l->num_lattice_site_var/2+1); - + int k, m = l->num_parent_eig_vect; + int tm_size = m*(m+1); + tm_term += start*tm_size; odd_proj += start*tm_size; - + if( cimag(even_shift) == 0. && cimag(odd_shift) == 0. ) { - - tm_shift = shift; - - for ( i=0; ithread != 0) + return; - if ( !l->idle ) { + config_PRECISION eps_term = op->epsbar_term; + if ( eps_term != NULL ) { + config_PRECISION odd_proj = op->odd_proj; + complex_PRECISION shift = -epsbar; + complex_PRECISION even_shift = I*even; + complex_PRECISION odd_shift = I*odd; - if ( mass_shift != l->dirac_shift ) { - shift_update_PRECISION( &(l->op_PRECISION), mass_shift, l, threading ); - shift_update_PRECISION( &(l->s_PRECISION.op), mass_shift, l, threading ); - START_LOCKED_MASTER(threading) - l->dirac_shift = mass_shift; - l->real_shift = creal(mass_shift); - END_LOCKED_MASTER(threading) - } - -#ifdef HAVE_TM - if ( l->tm_shift != g.tm_mu*g.tm_mu_factor[l->depth] || - l->tm_even_shift != g.tm_mu_even_shift*g.tm_mu_factor[l->depth] || - l->tm_odd_shift != g.tm_mu_odd_shift*g.tm_mu_factor[l->depth] ) { - START_LOCKED_MASTER(threading) - if( g.tm_mu_even_shift == g.tm_mu_odd_shift ) - printf0("depth: %d, updating mu to %f \n", (l->depth), cimag(g.tm_mu+g.tm_mu_even_shift)); - else - printf0("depth: %d, updating mu to %f on even sites and %f on odd sites \n", l->depth, cimag(g.tm_mu+g.tm_mu_even_shift), cimag(g.tm_mu+g.tm_mu_even_shift)); + START_MASTER(threading) + op->epsbar = epsbar; + op->epsbar_ig5_even_shift = even; + op->epsbar_ig5_odd_shift = odd; + END_MASTER(threading) + + int i, j; + int start, end; + compute_core_start_end(0, l->num_inner_lattice_sites, &start, &end, l, threading); + int n = end-start; + + if ( l->depth == 0 ) { + eps_term += start*12; + odd_proj += start*12; + + if( cimag(even_shift) == 0. && cimag(odd_shift) == 0. ) + for ( i=0; inum_parent_eig_vect; + int eps_size = m*(m+1); - l->tm_shift = g.tm_mu*g.tm_mu_factor[l->depth]; - l->tm_even_shift = g.tm_mu_even_shift*g.tm_mu_factor[l->depth]; - l->tm_odd_shift = g.tm_mu_odd_shift*g.tm_mu_factor[l->depth]; - END_LOCKED_MASTER(threading) - - tm_term_PRECISION_setup( l->op_PRECISION.tm_term, l->op_PRECISION.odd_proj, l, threading ); - tm_term_PRECISION_setup( l->s_PRECISION.op.tm_term, l->s_PRECISION.op.odd_proj, l, threading ); - } + eps_term += start*eps_size; + odd_proj += start*eps_size; + + if( cimag(even_shift) == 0. && cimag(odd_shift) == 0. 
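// --- editor's note (illustrative sketch, not part of the patch) -------------
// On the fine grid the twisted-mass term is a site diagonal i*mu_eff*gamma5,
// where mu_eff = mu + mu_even_shift on even sites and mu + mu_odd_shift on
// odd sites (the odd_proj vector selects the parity). The two spin halves get
// opposite signs; which half carries the plus sign follows the library's
// gamma5 convention, so the choice below is only illustrative:
#include <complex.h>

static void tm_site_diagonal( double complex term[12], double mu,
                              double even_shift, double odd_shift, int odd_site ) {
  double mu_eff = mu + ( odd_site ? odd_shift : even_shift );
  for ( int c = 0; c < 6; c++ ) {
    term[c]     = -I * mu_eff;   // first spin pair
    term[6 + c] = +I * mu_eff;   // second spin pair
  }
}
// On coarse grids the analogous term is block diagonal in the test-vector
// components and scaled by g.mu_factor[depth].
// ----------------------------------------------------------------------------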
) { + for ( i=0; i<2*n; i++ ) { + for ( j=0; jinner_vector_size; + PUBLIC_MALLOC( vd1, complex_double, 4*ivs + 2*4*ivs ); + PUBLIC_MALLOC( vpp1, complex_PRECISION, 2*2*ivs ); + + vd2 = vd1 + ivs; vd3 = vd2 + ivs; vd4 = vd3 + ivs; + vdd1 = vd4 + ivs; vdd2 = vdd1 + 2*ivs; vdd3 = vdd2 + 2*ivs; vdd4 = vdd3 + 2*ivs; + vpp2 = vpp1 + 2*ivs; + + START_LOCKED_MASTER(threading) + + vector_double_define_random( vd1, 0, l->inner_vector_size, l, no_threading ); + vector_double_define_random( vd2, 0, l->inner_vector_size, l, no_threading ); + apply_operator_double( vd3, vd1, &(g.p), l, no_threading ); +#ifdef HAVE_TM + vector_double_real_scale( g.op_double.tm_term, g.op_double.tm_term, -1, 0, l->inner_vector_size, l ); +#endif + apply_operator_double( vd4, vd2, &(g.p), l, no_threading ); +#ifdef HAVE_TM + vector_double_real_scale( g.op_double.tm_term, g.op_double.tm_term, -1, 0, l->inner_vector_size, l ); +#endif + add_diagonal_double( vd3, vd2, g.op_double.epsbar_term, l->inner_vector_size ); + add_diagonal_double( vd4, vd1, g.op_double.epsbar_term, l->inner_vector_size ); + + two_flavours_to_serial_double( vd1, vd2, vdd1, l, no_threading ); + two_flavours_to_serial_double( vd3, vd4, vdd2, l, no_threading ); + + data_layout_n_flavours( 2, l, threading ); + + trans_PRECISION( vpp1, vdd1, op->translation_table, l, no_threading ); + apply_operator_PRECISION( vpp2, vpp1, &(l->p_PRECISION), l, no_threading ); + trans_back_PRECISION( vdd3, vpp2, op->translation_table, l, no_threading ); + + vector_double_minus( vdd4, vdd3, vdd2, 0, l->inner_vector_size, l ); + diff = global_norm_double( vdd4, 0, l->inner_vector_size, l, no_threading ) / + global_norm_double( vdd3, 0, l->inner_vector_size, l, no_threading ); + + test0_PRECISION("depth: %d, correctness of doublet Dirac operator PRECISION: %le\n", l->depth, diff ); + END_LOCKED_MASTER(threading) + + if(threading->n_core > 1) { + trans_PRECISION( vpp1, vdd1, op->translation_table, l, threading ); + apply_operator_PRECISION( vpp2, vpp1, &(l->p_PRECISION), l, threading ); + trans_back_PRECISION( vdd3, vpp2, op->translation_table, l, threading ); + + SYNC_MASTER_TO_ALL(threading) SYNC_CORES(threading) - if ( !l->idle && g.method >= 4 && l->level > 0 && g.odd_even ) - coarse_oddeven_re_setup_PRECISION( &(l->s_PRECISION.op), _REORDER, l, threading ); - else if ( !l->idle && l->level == 0 && g.odd_even) - coarse_oddeven_re_setup_PRECISION( &(l->s_PRECISION.op), _NO_REORDERING, l, threading ); - else - coarse_operator_PRECISION_set_couplings_clover( &(l->s_PRECISION.op), l, threading ); - - if(l->level > 0) - optimized_shift_update_PRECISION( mass_shift, l->next_level, threading ); - } + START_LOCKED_MASTER(threading) + vector_double_minus( vdd4, vdd3, vdd2, 0, l->inner_vector_size, l ); + diff = global_norm_double( vdd4, 0, l->inner_vector_size, l, no_threading ) / + global_norm_double( vdd3, 0, l->inner_vector_size, l, no_threading ); + + test0_PRECISION("depth: %d, correctness of doublet Dirac operator PRECISION with threading: %le\n", l->depth, diff ); + END_LOCKED_MASTER(threading) + } + + PUBLIC_FREE( vd1, complex_double, 4*ivs + 2*4*ivs ); + PUBLIC_FREE( vpp1, complex_PRECISION, 2*2*ivs ); + + START_LOCKED_MASTER(threading) + if ( g.method >=4 && g.odd_even ) + oddeven_PRECISION_test( l ); + END_LOCKED_MASTER(threading) +#endif + } diff --git a/src/dirac_generic.h b/src/dirac_generic.h index 79bf51a..f88ae7e 100644 --- a/src/dirac_generic.h +++ b/src/dirac_generic.h @@ -24,46 +24,77 @@ struct Thread; - void gamma5_PRECISION( vector_PRECISION eta, 
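// --- editor's note (illustrative sketch, not part of the patch) -------------
// two_flavours_test_PRECISION() checks the doublet operator against two
// single-flavour applications: eta1 = D(+mu) phi1 + epsbar_term * phi2 and
// eta2 = D(-mu) phi2 + epsbar_term * phi1, serialized with
// two_flavours_to_serial and compared in norm. A one-component toy of that
// 2x2 flavour structure (all numbers are made up):
#include <complex.h>
#include <stdio.h>

int main( void ) {
  double complex d    = 3.6;           // "Wilson + clover" part
  double complex tm   = I * 0.01;      // i*mu*gamma5 part
  double complex eps  = -0.1;          // stored epsbar coupling
  double complex phi1 = 1.0 + 2.0 * I, phi2 = 0.5 - 1.0 * I;
  double complex eta1 = ( d + tm ) * phi1 + eps * phi2;
  double complex eta2 = ( d - tm ) * phi2 + eps * phi1;
  printf( "eta1 = %f%+fi\neta2 = %f%+fi\n",
          creal( eta1 ), cimag( eta1 ), creal( eta2 ), cimag( eta2 ) );
  return 0;
}
// ----------------------------------------------------------------------------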
vector_PRECISION phi, level_struct *l, struct Thread *threading ); - - void clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, config_PRECISION clover, int length, - level_struct *l, struct Thread *threading ); + void two_flavours_to_serial_PRECISION( vector_PRECISION flav1, vector_PRECISION flav2, vector_PRECISION serial, level_struct *l, struct Thread *threading ); + void serial_to_two_flavours_PRECISION( vector_PRECISION flav1, vector_PRECISION flav2, vector_PRECISION serial, level_struct *l, struct Thread *threading ); + + + void clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operator_PRECISION_struct *op, int start, int end, level_struct *l, struct Thread *threading ); void d_plus_clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); void d_plus_clover_dagger_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); void g5D_plus_clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); void block_d_plus_clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, int start, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ); + void set_clover_vectorized_PRECISION( operator_PRECISION_struct *op, level_struct *l, Thread *threading ); + void diagonal_aggregate_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, vector_PRECISION phi, config_PRECISION diag, level_struct *l ); void d_plus_clover_aggregate_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, vector_PRECISION phi, schwarz_PRECISION_struct *s, level_struct *l ); void d_neighbor_aggregate_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, vector_PRECISION phi, const int mu, schwarz_PRECISION_struct *s, level_struct *l ); void apply_twisted_bc_to_vector_PRECISION( vector_PRECISION eta, vector_PRECISION phi, double *theta, level_struct *l); void operator_updates_PRECISION( level_struct *l, struct Thread *threading ); - void shift_update_PRECISION( operator_PRECISION_struct *op, complex_PRECISION shift, level_struct *l, struct Thread *threading ); - void tm_term_PRECISION_setup( config_PRECISION tm_term, config_PRECISION odd_proj, level_struct *l, struct Thread *threading ); - void optimized_shift_update_PRECISION( complex_PRECISION mass_shift, level_struct *l, struct Thread *threading ); + void m0_update_PRECISION( PRECISION m0,operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); + void tm_term_PRECISION_setup( PRECISION mu, PRECISION even, PRECISION odd, operator_PRECISION_struct *op, + level_struct *l, struct Thread *threading ); + void epsbar_term_PRECISION_setup( PRECISION epsbar, PRECISION even, PRECISION odd, operator_PRECISION_struct *op, + level_struct *l, struct Thread *threading ); + void two_flavours_test_PRECISION( operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); + + void gamma5_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ); + void tau1_gamma5_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ); + void set_even_to_zero_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ); + void gamma5_set_even_to_zero_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ); + void 
tau1_gamma5_set_even_to_zero_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ); + void set_odd_to_zero_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ); + void gamma5_set_odd_to_zero_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ); + void tau1_gamma5_set_odd_to_zero_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ); + void scale_even_odd_PRECISION( vector_PRECISION eta, vector_PRECISION phi, complex_double even, complex_double odd, + level_struct *l, struct Thread *threading ); + static inline void add_diagonal_PRECISION( const vector_PRECISION eta, const vector_PRECISION phi, - const config_PRECISION diag, const int length ) { + const config_PRECISION diag, const int length ) { + config_PRECISION diag_pt = diag; + vector_PRECISION phi_pt = phi, eta_pt = eta, eta_end = eta + length; +#ifdef HAVE_TM1p1 + if(g.n_flavours == 2) + while ( eta_pt < eta_end ) { + FOR6( *eta_pt += (*phi_pt)*(*diag_pt); eta_pt++; phi_pt++; diag_pt++; ) + diag_pt -= 6; + FOR6( *eta_pt -= (*phi_pt)*(*diag_pt); eta_pt++; phi_pt++; diag_pt++; ) + FOR6( *eta_pt += (*phi_pt)*(*diag_pt); eta_pt++; phi_pt++; diag_pt++; ) + diag_pt -= 6; + FOR6( *eta_pt -= (*phi_pt)*(*diag_pt); eta_pt++; phi_pt++; diag_pt++; ) + } + else +#endif + while ( eta_pt < eta_end ) + FOR12( *eta_pt += (*phi_pt)*(*diag_pt); eta_pt++; phi_pt++; diag_pt++; ) + } + +#ifdef HAVE_TM1p1 + static inline void apply_doublet_coupling_PRECISION( const vector_PRECISION eta, const vector_PRECISION phi, + const config_PRECISION diag, const int length ) { config_PRECISION diag_pt = diag; vector_PRECISION phi_pt = phi, eta_pt = eta, eta_end = eta + length; while ( eta_pt < eta_end ) { - FOR12( *eta_pt += (*phi_pt)*(*diag_pt); eta_pt++; phi_pt++; diag_pt++; ) + phi_pt += 6; + FOR6( *eta_pt += (*phi_pt)*(*diag_pt); eta_pt++; phi_pt++; diag_pt++; ) + phi_pt -= 12; + diag_pt -= 6; + FOR6( *eta_pt += (*phi_pt)*(*diag_pt); eta_pt++; phi_pt++; diag_pt++; ) + phi_pt += 6; } } - static inline void zero12_PRECISION( const vector_PRECISION phi ) { - phi[ 0] = _COMPLEX_PRECISION_ZERO; - phi[ 1] = _COMPLEX_PRECISION_ZERO; - phi[ 2] = _COMPLEX_PRECISION_ZERO; - phi[ 3] = _COMPLEX_PRECISION_ZERO; - phi[ 4] = _COMPLEX_PRECISION_ZERO; - phi[ 5] = _COMPLEX_PRECISION_ZERO; - phi[ 6] = _COMPLEX_PRECISION_ZERO; - phi[ 7] = _COMPLEX_PRECISION_ZERO; - phi[ 8] = _COMPLEX_PRECISION_ZERO; - phi[ 9] = _COMPLEX_PRECISION_ZERO; - phi[10] = _COMPLEX_PRECISION_ZERO; - phi[11] = _COMPLEX_PRECISION_ZERO; - } +#endif // eta = D*phi static inline void mvm_PRECISION( const vector_PRECISION eta, const complex_PRECISION *D, const vector_PRECISION phi ) { @@ -117,6 +148,96 @@ eta[2] -= conj_PRECISION(D[8])*phi[2]; } +/* + // 1 +/- gamma_mu + static inline void pr_PRECISION( const vector_PRECISION prp_pt, const vector_PRECISION l_pt, const int mu, const int sign ) { + prp_pt[0] = l_pt[0] + sign*gamma_val_PRECISION[mu][0]*l_pt[3*gamma_co[mu][0]+0]; + prp_pt[1] = l_pt[1] + sign*gamma_val_PRECISION[mu][0]*l_pt[3*gamma_co[mu][0]+1]; + prp_pt[2] = l_pt[2] + sign*gamma_val_PRECISION[mu][0]*l_pt[3*gamma_co[mu][0]+2]; + prp_pt[3] = l_pt[3] + sign*gamma_val_PRECISION[mu][1]*l_pt[3*gamma_co[mu][1]+0]; + prp_pt[4] = l_pt[4] + sign*gamma_val_PRECISION[mu][1]*l_pt[3*gamma_co[mu][1]+1]; + prp_pt[5] = l_pt[5] + sign*gamma_val_PRECISION[mu][1]*l_pt[3*gamma_co[mu][1]+2]; + } + + // 1 +/- gamma_mu + static inline void 
pr_doublet_PRECISION( const vector_PRECISION prp_pt, const vector_PRECISION l_pt, const int mu, const int sign ) { + prp_pt[ 0] = l_pt[ 0] + sign*gamma_val_PRECISION[mu][0]*l_pt[3*gamma_co[mu][0]+6*gamma_doublet_offset[mu][0]+0]; + prp_pt[ 1] = l_pt[ 1] + sign*gamma_val_PRECISION[mu][0]*l_pt[3*gamma_co[mu][0]+6*gamma_doublet_offset[mu][0]+1]; + prp_pt[ 2] = l_pt[ 2] + sign*gamma_val_PRECISION[mu][0]*l_pt[3*gamma_co[mu][0]+6*gamma_doublet_offset[mu][0]+2]; + prp_pt[ 3] = l_pt[ 3] + sign*gamma_val_PRECISION[mu][1]*l_pt[3*gamma_co[mu][1]+6*gamma_doublet_offset[mu][1]+0]; + prp_pt[ 4] = l_pt[ 4] + sign*gamma_val_PRECISION[mu][1]*l_pt[3*gamma_co[mu][1]+6*gamma_doublet_offset[mu][1]+1]; + prp_pt[ 5] = l_pt[ 5] + sign*gamma_val_PRECISION[mu][1]*l_pt[3*gamma_co[mu][1]+6*gamma_doublet_offset[mu][1]+2]; + prp_pt[ 6] = l_pt[ 6] + sign*gamma_val_PRECISION[mu][0]*l_pt[3*gamma_co[mu][0]+6*gamma_doublet_offset[mu][0]+6]; + prp_pt[ 7] = l_pt[ 7] + sign*gamma_val_PRECISION[mu][0]*l_pt[3*gamma_co[mu][0]+6*gamma_doublet_offset[mu][0]+7]; + prp_pt[ 8] = l_pt[ 8] + sign*gamma_val_PRECISION[mu][0]*l_pt[3*gamma_co[mu][0]+6*gamma_doublet_offset[mu][0]+8]; + prp_pt[ 9] = l_pt[ 9] + sign*gamma_val_PRECISION[mu][1]*l_pt[3*gamma_co[mu][1]+6*gamma_doublet_offset[mu][1]+6]; + prp_pt[10] = l_pt[10] + sign*gamma_val_PRECISION[mu][1]*l_pt[3*gamma_co[mu][1]+6*gamma_doublet_offset[mu][1]+7]; + prp_pt[11] = l_pt[11] + sign*gamma_val_PRECISION[mu][1]*l_pt[3*gamma_co[mu][1]+6*gamma_doublet_offset[mu][1]+8]; + } + +static inline void project_PRECISION( complex_double *pr[4], complex_double *phi, int start, int end, level_struct *l ) { + int site_var = l->num_lattice_site_var; + complex_double *phi_pt = phi+start*site_var; + complex_double *phi_end = phi+end*site_var; + complex_double *pr_pt[4] = {pr[0]+start*site_var/2, pr[1]+start*site_var/2, pr[2]+start*site_var/2, prn[3]+start*site_var/2}; + +#ifndef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION +#ifdef HAVE_TM1p1 + if( g.n_flavours == 2 ) + while( phi_pt < phi_end ) { + int mu = 0; + FOR4( pr_doublet_PRECISION( pr_pt[mu], phi_pt, mu, -1 ); pr_pt[mu]+=site_var/2; mu++;); + phi_pt += site_var; + } + else +#endif + while( phi_pt < phi_end ) { + int mu = 0; + FOR4( pr_PRECISION( pr_pt[mu], phi_pt, mu, -1 ); pr_pt[mu]+=site_var/2; mu++;); + phi_pt += site_var; + } + +#else + + PRECISION sign_re[4*3*SIMD_LENGTH_PRECISION]; + PRECISION sign_im[4*3*SIMD_LENGTH_PRECISION]; + int index_re[4*3*SIMD_LENGTH_PRECISION]; + int index_im[4*3*SIMD_LENGTH_PRECISION]; + + int j=0; + for ( int mu=0; mu<4; mu++ ) + for ( int i=0; i<3*SIMD_LENGTH_PRECISION; i++, j++ ) { + int spin = (i%6)/3; + sign_re[j] = creal(gamma_val_PRECISION[mu][spin])+creal(I*gamma_val_PRECISION[mu][spin]); + sign_im[j] = cimag(gamma_val_PRECISION[mu][spin])+cimag(I*gamma_val_PRECISION[mu][spin]); + index_re[j] = 6*gamma_co[mu][spin] + gamma_offset[mu][spin] + 0 + 2*(i%(site_var/2)); + index_im[j] = 6*gamma_co[mu][spin] - gamma_offset[mu][spin] + 1 + 2*(i%(site_var/2)); + } + +#ifdef HAVE_TM1p1 + if( g.n_flavours == 2 ) { + } else +#endif + while( phi_pt < phi_end ) { + mm_PRECISION phi_pt1_re[3], phi_pt1_im[3]; + mm_loadi_6times_float( (PRECISION*) phi_pt+0, &(phi_pt1_re[0]), &(phi_pt1_re[1]), &(phi_pt1_re[2]), 2, 24 ); + mm_loadi_6times_float( (PRECISION*) phi_pt+1, &(phi_pt1_im[0]), &(phi_pt1_im[1]), &(phi_pt1_im[2]), 2, 24 ); + for ( int mu=0; mu<4; mu++ ) + for ( int i=0; i<3; i++ ) { + mm_PRECISION phi_pt2_re = mm_set_from_list( (PRECISION*) phi_pt+0, &(sign_re[(mu*3+i)*SIMD_LENGTH_PRECISION]), 
&(index_re[(mu*3+i)*SIMD_LENGTH_PRECISION]) ); + mm_PRECISION phi_pt2_im = mm_set_from_list( (PRECISION*) phi_pt+1, &(sign_im[(mu*3+i)*SIMD_LENGTH_PRECISION]), &(index_im[(mu*3+i)*SIMD_LENGTH_PRECISION]) ); + mm_PRECISION res_re = mm_sub_PRECISION( phi_pt1_re, phi_pt2_re ); + mm_PRECISION res_im = mm_sub_PRECISION( phi_pt1_im, phi_pt2_im ); + cstore_PRECISION( res_re, res_im, pr[mu] ); + pr[mu] += SIMD_LENGTH_PRECISION; + } + phi_pt += 4*3*SIMD_LENGTH_PRECISION; + } + + +#endif +} +*/ // 1 - gamma_T static inline void prp_T_PRECISION( const vector_PRECISION prp_pt, const vector_PRECISION l_pt ) { prp_pt[0] = l_pt[0] -GAMMA_T_SPIN0_VAL*l_pt[3*GAMMA_T_SPIN0_CO]; @@ -313,6 +434,370 @@ l_pt[11] -= GAMMA_X_SPIN3_VAL*prn_su3_pt[3*GAMMA_X_SPIN3_CO+2]; } +//START +#ifdef HAVE_TM1p1 + +//#define flav_gamma(k) ((k)>1?((k)*3+6):((k)*3)) +#define flav_gamma(k) (3*(k)+6*((k)/2)) + + // 1 - gamma_T + static inline void dprp_T_PRECISION( const vector_PRECISION prp_pt, const vector_PRECISION l_pt ) { + prp_pt[ 0] = l_pt[ 0] -GAMMA_T_SPIN0_VAL*l_pt[flav_gamma(GAMMA_T_SPIN0_CO)]; + prp_pt[ 1] = l_pt[ 1] -GAMMA_T_SPIN0_VAL*l_pt[flav_gamma(GAMMA_T_SPIN0_CO)+1]; + prp_pt[ 2] = l_pt[ 2] -GAMMA_T_SPIN0_VAL*l_pt[flav_gamma(GAMMA_T_SPIN0_CO)+2]; + prp_pt[ 3] = l_pt[ 3] -GAMMA_T_SPIN1_VAL*l_pt[flav_gamma(GAMMA_T_SPIN1_CO)]; + prp_pt[ 4] = l_pt[ 4] -GAMMA_T_SPIN1_VAL*l_pt[flav_gamma(GAMMA_T_SPIN1_CO)+1]; + prp_pt[ 5] = l_pt[ 5] -GAMMA_T_SPIN1_VAL*l_pt[flav_gamma(GAMMA_T_SPIN1_CO)+2]; + prp_pt[ 6] = l_pt[ 6] -GAMMA_T_SPIN0_VAL*l_pt[flav_gamma(GAMMA_T_SPIN0_CO)+6]; + prp_pt[ 7] = l_pt[ 7] -GAMMA_T_SPIN0_VAL*l_pt[flav_gamma(GAMMA_T_SPIN0_CO)+7]; + prp_pt[ 8] = l_pt[ 8] -GAMMA_T_SPIN0_VAL*l_pt[flav_gamma(GAMMA_T_SPIN0_CO)+8]; + prp_pt[ 9] = l_pt[ 9] -GAMMA_T_SPIN1_VAL*l_pt[flav_gamma(GAMMA_T_SPIN1_CO)+6]; + prp_pt[10] = l_pt[10] -GAMMA_T_SPIN1_VAL*l_pt[flav_gamma(GAMMA_T_SPIN1_CO)+7]; + prp_pt[11] = l_pt[11] -GAMMA_T_SPIN1_VAL*l_pt[flav_gamma(GAMMA_T_SPIN1_CO)+8]; + } + + // 1 + gamma_T + static inline void dprn_T_PRECISION( const vector_PRECISION prn_pt, const vector_PRECISION l_pt ) { + prn_pt[ 0] = l_pt[ 0] +GAMMA_T_SPIN0_VAL*l_pt[flav_gamma(GAMMA_T_SPIN0_CO)]; + prn_pt[ 1] = l_pt[ 1] +GAMMA_T_SPIN0_VAL*l_pt[flav_gamma(GAMMA_T_SPIN0_CO)+1]; + prn_pt[ 2] = l_pt[ 2] +GAMMA_T_SPIN0_VAL*l_pt[flav_gamma(GAMMA_T_SPIN0_CO)+2]; + prn_pt[ 3] = l_pt[ 3] +GAMMA_T_SPIN1_VAL*l_pt[flav_gamma(GAMMA_T_SPIN1_CO)]; + prn_pt[ 4] = l_pt[ 4] +GAMMA_T_SPIN1_VAL*l_pt[flav_gamma(GAMMA_T_SPIN1_CO)+1]; + prn_pt[ 5] = l_pt[ 5] +GAMMA_T_SPIN1_VAL*l_pt[flav_gamma(GAMMA_T_SPIN1_CO)+2]; + prn_pt[ 6] = l_pt[ 6] +GAMMA_T_SPIN0_VAL*l_pt[flav_gamma(GAMMA_T_SPIN0_CO)+6]; + prn_pt[ 7] = l_pt[ 7] +GAMMA_T_SPIN0_VAL*l_pt[flav_gamma(GAMMA_T_SPIN0_CO)+7]; + prn_pt[ 8] = l_pt[ 8] +GAMMA_T_SPIN0_VAL*l_pt[flav_gamma(GAMMA_T_SPIN0_CO)+8]; + prn_pt[ 9] = l_pt[ 9] +GAMMA_T_SPIN1_VAL*l_pt[flav_gamma(GAMMA_T_SPIN1_CO)+6]; + prn_pt[10] = l_pt[10] +GAMMA_T_SPIN1_VAL*l_pt[flav_gamma(GAMMA_T_SPIN1_CO)+7]; + prn_pt[11] = l_pt[11] +GAMMA_T_SPIN1_VAL*l_pt[flav_gamma(GAMMA_T_SPIN1_CO)+8]; + } + + // - (1 - gamma_T) + static inline void dpbp_su3_T_PRECISION( const vector_PRECISION prp_su3_pt, const vector_PRECISION l_pt ) { + l_pt[ 0] -= prp_su3_pt[ 0]; + l_pt[ 1] -= prp_su3_pt[ 1]; + l_pt[ 2] -= prp_su3_pt[ 2]; + l_pt[ 3] -= prp_su3_pt[ 3]; + l_pt[ 4] -= prp_su3_pt[ 4]; + l_pt[ 5] -= prp_su3_pt[ 5]; + l_pt[ 6] -= prp_su3_pt[ 6]; + l_pt[ 7] -= prp_su3_pt[ 7]; + l_pt[ 8] -= prp_su3_pt[ 8]; + l_pt[ 9] -= prp_su3_pt[ 9]; + l_pt[10] -= prp_su3_pt[10]; + l_pt[11] -= 
prp_su3_pt[11]; + l_pt[12] += GAMMA_T_SPIN2_VAL*prp_su3_pt[flav_gamma(GAMMA_T_SPIN2_CO)]; + l_pt[13] += GAMMA_T_SPIN2_VAL*prp_su3_pt[flav_gamma(GAMMA_T_SPIN2_CO)+1]; + l_pt[14] += GAMMA_T_SPIN2_VAL*prp_su3_pt[flav_gamma(GAMMA_T_SPIN2_CO)+2]; + l_pt[15] += GAMMA_T_SPIN3_VAL*prp_su3_pt[flav_gamma(GAMMA_T_SPIN3_CO)]; + l_pt[16] += GAMMA_T_SPIN3_VAL*prp_su3_pt[flav_gamma(GAMMA_T_SPIN3_CO)+1]; + l_pt[17] += GAMMA_T_SPIN3_VAL*prp_su3_pt[flav_gamma(GAMMA_T_SPIN3_CO)+2]; + l_pt[18] += GAMMA_T_SPIN2_VAL*prp_su3_pt[flav_gamma(GAMMA_T_SPIN2_CO)+6]; + l_pt[19] += GAMMA_T_SPIN2_VAL*prp_su3_pt[flav_gamma(GAMMA_T_SPIN2_CO)+7]; + l_pt[20] += GAMMA_T_SPIN2_VAL*prp_su3_pt[flav_gamma(GAMMA_T_SPIN2_CO)+8]; + l_pt[21] += GAMMA_T_SPIN3_VAL*prp_su3_pt[flav_gamma(GAMMA_T_SPIN3_CO)+6]; + l_pt[22] += GAMMA_T_SPIN3_VAL*prp_su3_pt[flav_gamma(GAMMA_T_SPIN3_CO)+7]; + l_pt[23] += GAMMA_T_SPIN3_VAL*prp_su3_pt[flav_gamma(GAMMA_T_SPIN3_CO)+8]; + } + + // -(1 + gamma_T) + static inline void dpbn_su3_T_PRECISION( const vector_PRECISION prn_su3_pt, const vector_PRECISION l_pt ) { + l_pt[ 0] -= prn_su3_pt[ 0]; + l_pt[ 1] -= prn_su3_pt[ 1]; + l_pt[ 2] -= prn_su3_pt[ 2]; + l_pt[ 3] -= prn_su3_pt[ 3]; + l_pt[ 4] -= prn_su3_pt[ 4]; + l_pt[ 5] -= prn_su3_pt[ 5]; + l_pt[ 6] -= prn_su3_pt[ 6]; + l_pt[ 7] -= prn_su3_pt[ 7]; + l_pt[ 8] -= prn_su3_pt[ 8]; + l_pt[ 9] -= prn_su3_pt[ 9]; + l_pt[10] -= prn_su3_pt[10]; + l_pt[11] -= prn_su3_pt[11]; + l_pt[12] -= GAMMA_T_SPIN2_VAL*prn_su3_pt[flav_gamma(GAMMA_T_SPIN2_CO)]; + l_pt[13] -= GAMMA_T_SPIN2_VAL*prn_su3_pt[flav_gamma(GAMMA_T_SPIN2_CO)+1]; + l_pt[14] -= GAMMA_T_SPIN2_VAL*prn_su3_pt[flav_gamma(GAMMA_T_SPIN2_CO)+2]; + l_pt[15] -= GAMMA_T_SPIN3_VAL*prn_su3_pt[flav_gamma(GAMMA_T_SPIN3_CO)]; + l_pt[16] -= GAMMA_T_SPIN3_VAL*prn_su3_pt[flav_gamma(GAMMA_T_SPIN3_CO)+1]; + l_pt[17] -= GAMMA_T_SPIN3_VAL*prn_su3_pt[flav_gamma(GAMMA_T_SPIN3_CO)+2]; + l_pt[18] -= GAMMA_T_SPIN2_VAL*prn_su3_pt[flav_gamma(GAMMA_T_SPIN2_CO)+6]; + l_pt[19] -= GAMMA_T_SPIN2_VAL*prn_su3_pt[flav_gamma(GAMMA_T_SPIN2_CO)+7]; + l_pt[20] -= GAMMA_T_SPIN2_VAL*prn_su3_pt[flav_gamma(GAMMA_T_SPIN2_CO)+8]; + l_pt[21] -= GAMMA_T_SPIN3_VAL*prn_su3_pt[flav_gamma(GAMMA_T_SPIN3_CO)+6]; + l_pt[22] -= GAMMA_T_SPIN3_VAL*prn_su3_pt[flav_gamma(GAMMA_T_SPIN3_CO)+7]; + l_pt[23] -= GAMMA_T_SPIN3_VAL*prn_su3_pt[flav_gamma(GAMMA_T_SPIN3_CO)+8]; + } + + + // 1 - gamma_Z + static inline void dprp_Z_PRECISION( const vector_PRECISION prp_pt, const vector_PRECISION l_pt ) { + prp_pt[ 0] = l_pt[ 0] -GAMMA_Z_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Z_SPIN0_CO)]; + prp_pt[ 1] = l_pt[ 1] -GAMMA_Z_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Z_SPIN0_CO)+1]; + prp_pt[ 2] = l_pt[ 2] -GAMMA_Z_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Z_SPIN0_CO)+2]; + prp_pt[ 3] = l_pt[ 3] -GAMMA_Z_SPIN1_VAL*l_pt[flav_gamma(GAMMA_Z_SPIN1_CO)]; + prp_pt[ 4] = l_pt[ 4] -GAMMA_Z_SPIN1_VAL*l_pt[flav_gamma(GAMMA_Z_SPIN1_CO)+1]; + prp_pt[ 5] = l_pt[ 5] -GAMMA_Z_SPIN1_VAL*l_pt[flav_gamma(GAMMA_Z_SPIN1_CO)+2]; + prp_pt[ 6] = l_pt[ 6] -GAMMA_Z_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Z_SPIN0_CO)+6]; + prp_pt[ 7] = l_pt[ 7] -GAMMA_Z_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Z_SPIN0_CO)+7]; + prp_pt[ 8] = l_pt[ 8] -GAMMA_Z_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Z_SPIN0_CO)+8]; + prp_pt[ 9] = l_pt[ 9] -GAMMA_Z_SPIN1_VAL*l_pt[flav_gamma(GAMMA_Z_SPIN1_CO)+6]; + prp_pt[10] = l_pt[10] -GAMMA_Z_SPIN1_VAL*l_pt[flav_gamma(GAMMA_Z_SPIN1_CO)+7]; + prp_pt[11] = l_pt[11] -GAMMA_Z_SPIN1_VAL*l_pt[flav_gamma(GAMMA_Z_SPIN1_CO)+8]; + } + + // 1 + gamma_Z + static inline void dprn_Z_PRECISION( const vector_PRECISION prn_pt, const vector_PRECISION l_pt ) { + 
prn_pt[ 0] = l_pt[ 0] +GAMMA_Z_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Z_SPIN0_CO)]; + prn_pt[ 1] = l_pt[ 1] +GAMMA_Z_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Z_SPIN0_CO)+1]; + prn_pt[ 2] = l_pt[ 2] +GAMMA_Z_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Z_SPIN0_CO)+2]; + prn_pt[ 3] = l_pt[ 3] +GAMMA_Z_SPIN1_VAL*l_pt[flav_gamma(GAMMA_Z_SPIN1_CO)]; + prn_pt[ 4] = l_pt[ 4] +GAMMA_Z_SPIN1_VAL*l_pt[flav_gamma(GAMMA_Z_SPIN1_CO)+1]; + prn_pt[ 5] = l_pt[ 5] +GAMMA_Z_SPIN1_VAL*l_pt[flav_gamma(GAMMA_Z_SPIN1_CO)+2]; + prn_pt[ 6] = l_pt[ 6] +GAMMA_Z_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Z_SPIN0_CO)+6]; + prn_pt[ 7] = l_pt[ 7] +GAMMA_Z_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Z_SPIN0_CO)+7]; + prn_pt[ 8] = l_pt[ 8] +GAMMA_Z_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Z_SPIN0_CO)+8]; + prn_pt[ 9] = l_pt[ 9] +GAMMA_Z_SPIN1_VAL*l_pt[flav_gamma(GAMMA_Z_SPIN1_CO)+6]; + prn_pt[10] = l_pt[10] +GAMMA_Z_SPIN1_VAL*l_pt[flav_gamma(GAMMA_Z_SPIN1_CO)+7]; + prn_pt[11] = l_pt[11] +GAMMA_Z_SPIN1_VAL*l_pt[flav_gamma(GAMMA_Z_SPIN1_CO)+8]; + } + + // - (1 - gamma_Z) + static inline void dpbp_su3_Z_PRECISION( const vector_PRECISION prp_su3_pt, const vector_PRECISION l_pt ) { + l_pt[ 0] -= prp_su3_pt[ 0]; + l_pt[ 1] -= prp_su3_pt[ 1]; + l_pt[ 2] -= prp_su3_pt[ 2]; + l_pt[ 3] -= prp_su3_pt[ 3]; + l_pt[ 4] -= prp_su3_pt[ 4]; + l_pt[ 5] -= prp_su3_pt[ 5]; + l_pt[ 6] -= prp_su3_pt[ 6]; + l_pt[ 7] -= prp_su3_pt[ 7]; + l_pt[ 8] -= prp_su3_pt[ 8]; + l_pt[ 9] -= prp_su3_pt[ 9]; + l_pt[10] -= prp_su3_pt[10]; + l_pt[11] -= prp_su3_pt[11]; + l_pt[12] += GAMMA_Z_SPIN2_VAL*prp_su3_pt[flav_gamma(GAMMA_Z_SPIN2_CO)]; + l_pt[13] += GAMMA_Z_SPIN2_VAL*prp_su3_pt[flav_gamma(GAMMA_Z_SPIN2_CO)+1]; + l_pt[14] += GAMMA_Z_SPIN2_VAL*prp_su3_pt[flav_gamma(GAMMA_Z_SPIN2_CO)+2]; + l_pt[15] += GAMMA_Z_SPIN3_VAL*prp_su3_pt[flav_gamma(GAMMA_Z_SPIN3_CO)]; + l_pt[16] += GAMMA_Z_SPIN3_VAL*prp_su3_pt[flav_gamma(GAMMA_Z_SPIN3_CO)+1]; + l_pt[17] += GAMMA_Z_SPIN3_VAL*prp_su3_pt[flav_gamma(GAMMA_Z_SPIN3_CO)+2]; + l_pt[18] += GAMMA_Z_SPIN2_VAL*prp_su3_pt[flav_gamma(GAMMA_Z_SPIN2_CO)+6]; + l_pt[19] += GAMMA_Z_SPIN2_VAL*prp_su3_pt[flav_gamma(GAMMA_Z_SPIN2_CO)+7]; + l_pt[20] += GAMMA_Z_SPIN2_VAL*prp_su3_pt[flav_gamma(GAMMA_Z_SPIN2_CO)+8]; + l_pt[21] += GAMMA_Z_SPIN3_VAL*prp_su3_pt[flav_gamma(GAMMA_Z_SPIN3_CO)+6]; + l_pt[22] += GAMMA_Z_SPIN3_VAL*prp_su3_pt[flav_gamma(GAMMA_Z_SPIN3_CO)+7]; + l_pt[23] += GAMMA_Z_SPIN3_VAL*prp_su3_pt[flav_gamma(GAMMA_Z_SPIN3_CO)+8]; + } + + // -(1 + gamma_Z) + static inline void dpbn_su3_Z_PRECISION( const vector_PRECISION prn_su3_pt, const vector_PRECISION l_pt ) { + l_pt[ 0] -= prn_su3_pt[ 0]; + l_pt[ 1] -= prn_su3_pt[ 1]; + l_pt[ 2] -= prn_su3_pt[ 2]; + l_pt[ 3] -= prn_su3_pt[ 3]; + l_pt[ 4] -= prn_su3_pt[ 4]; + l_pt[ 5] -= prn_su3_pt[ 5]; + l_pt[ 6] -= prn_su3_pt[ 6]; + l_pt[ 7] -= prn_su3_pt[ 7]; + l_pt[ 8] -= prn_su3_pt[ 8]; + l_pt[ 9] -= prn_su3_pt[ 9]; + l_pt[10] -= prn_su3_pt[10]; + l_pt[11] -= prn_su3_pt[11]; + l_pt[12] -= GAMMA_Z_SPIN2_VAL*prn_su3_pt[flav_gamma(GAMMA_Z_SPIN2_CO)]; + l_pt[13] -= GAMMA_Z_SPIN2_VAL*prn_su3_pt[flav_gamma(GAMMA_Z_SPIN2_CO)+1]; + l_pt[14] -= GAMMA_Z_SPIN2_VAL*prn_su3_pt[flav_gamma(GAMMA_Z_SPIN2_CO)+2]; + l_pt[15] -= GAMMA_Z_SPIN3_VAL*prn_su3_pt[flav_gamma(GAMMA_Z_SPIN3_CO)]; + l_pt[16] -= GAMMA_Z_SPIN3_VAL*prn_su3_pt[flav_gamma(GAMMA_Z_SPIN3_CO)+1]; + l_pt[17] -= GAMMA_Z_SPIN3_VAL*prn_su3_pt[flav_gamma(GAMMA_Z_SPIN3_CO)+2]; + l_pt[18] -= GAMMA_Z_SPIN2_VAL*prn_su3_pt[flav_gamma(GAMMA_Z_SPIN2_CO)+6]; + l_pt[19] -= GAMMA_Z_SPIN2_VAL*prn_su3_pt[flav_gamma(GAMMA_Z_SPIN2_CO)+7]; + l_pt[20] -= GAMMA_Z_SPIN2_VAL*prn_su3_pt[flav_gamma(GAMMA_Z_SPIN2_CO)+8]; + 
l_pt[21] -= GAMMA_Z_SPIN3_VAL*prn_su3_pt[flav_gamma(GAMMA_Z_SPIN3_CO)+6]; + l_pt[22] -= GAMMA_Z_SPIN3_VAL*prn_su3_pt[flav_gamma(GAMMA_Z_SPIN3_CO)+7]; + l_pt[23] -= GAMMA_Z_SPIN3_VAL*prn_su3_pt[flav_gamma(GAMMA_Z_SPIN3_CO)+8]; + } + + + // 1 - gamma_Y + static inline void dprp_Y_PRECISION( const vector_PRECISION prp_pt, const vector_PRECISION l_pt ) { + prp_pt[ 0] = l_pt[ 0] -GAMMA_Y_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Y_SPIN0_CO)]; + prp_pt[ 1] = l_pt[ 1] -GAMMA_Y_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Y_SPIN0_CO)+1]; + prp_pt[ 2] = l_pt[ 2] -GAMMA_Y_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Y_SPIN0_CO)+2]; + prp_pt[ 3] = l_pt[ 3] -GAMMA_Y_SPIN1_VAL*l_pt[flav_gamma(GAMMA_Y_SPIN1_CO)]; + prp_pt[ 4] = l_pt[ 4] -GAMMA_Y_SPIN1_VAL*l_pt[flav_gamma(GAMMA_Y_SPIN1_CO)+1]; + prp_pt[ 5] = l_pt[ 5] -GAMMA_Y_SPIN1_VAL*l_pt[flav_gamma(GAMMA_Y_SPIN1_CO)+2]; + prp_pt[ 6] = l_pt[ 6] -GAMMA_Y_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Y_SPIN0_CO)+6]; + prp_pt[ 7] = l_pt[ 7] -GAMMA_Y_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Y_SPIN0_CO)+7]; + prp_pt[ 8] = l_pt[ 8] -GAMMA_Y_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Y_SPIN0_CO)+8]; + prp_pt[ 9] = l_pt[ 9] -GAMMA_Y_SPIN1_VAL*l_pt[flav_gamma(GAMMA_Y_SPIN1_CO)+6]; + prp_pt[10] = l_pt[10] -GAMMA_Y_SPIN1_VAL*l_pt[flav_gamma(GAMMA_Y_SPIN1_CO)+7]; + prp_pt[11] = l_pt[11] -GAMMA_Y_SPIN1_VAL*l_pt[flav_gamma(GAMMA_Y_SPIN1_CO)+8]; + } + + // 1 + gamma_Y + static inline void dprn_Y_PRECISION( const vector_PRECISION prn_pt, const vector_PRECISION l_pt ) { + prn_pt[ 0] = l_pt[ 0] +GAMMA_Y_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Y_SPIN0_CO)]; + prn_pt[ 1] = l_pt[ 1] +GAMMA_Y_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Y_SPIN0_CO)+1]; + prn_pt[ 2] = l_pt[ 2] +GAMMA_Y_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Y_SPIN0_CO)+2]; + prn_pt[ 3] = l_pt[ 3] +GAMMA_Y_SPIN1_VAL*l_pt[flav_gamma(GAMMA_Y_SPIN1_CO)]; + prn_pt[ 4] = l_pt[ 4] +GAMMA_Y_SPIN1_VAL*l_pt[flav_gamma(GAMMA_Y_SPIN1_CO)+1]; + prn_pt[ 5] = l_pt[ 5] +GAMMA_Y_SPIN1_VAL*l_pt[flav_gamma(GAMMA_Y_SPIN1_CO)+2]; + prn_pt[ 6] = l_pt[ 6] +GAMMA_Y_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Y_SPIN0_CO)+6]; + prn_pt[ 7] = l_pt[ 7] +GAMMA_Y_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Y_SPIN0_CO)+7]; + prn_pt[ 8] = l_pt[ 8] +GAMMA_Y_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Y_SPIN0_CO)+8]; + prn_pt[ 9] = l_pt[ 9] +GAMMA_Y_SPIN1_VAL*l_pt[flav_gamma(GAMMA_Y_SPIN1_CO)+6]; + prn_pt[10] = l_pt[10] +GAMMA_Y_SPIN1_VAL*l_pt[flav_gamma(GAMMA_Y_SPIN1_CO)+7]; + prn_pt[11] = l_pt[11] +GAMMA_Y_SPIN1_VAL*l_pt[flav_gamma(GAMMA_Y_SPIN1_CO)+8]; + } + + // - (1 - gamma_Y) + static inline void dpbp_su3_Y_PRECISION( const vector_PRECISION prp_su3_pt, const vector_PRECISION l_pt ) { + l_pt[ 0] -= prp_su3_pt[ 0]; + l_pt[ 1] -= prp_su3_pt[ 1]; + l_pt[ 2] -= prp_su3_pt[ 2]; + l_pt[ 3] -= prp_su3_pt[ 3]; + l_pt[ 4] -= prp_su3_pt[ 4]; + l_pt[ 5] -= prp_su3_pt[ 5]; + l_pt[ 6] -= prp_su3_pt[ 6]; + l_pt[ 7] -= prp_su3_pt[ 7]; + l_pt[ 8] -= prp_su3_pt[ 8]; + l_pt[ 9] -= prp_su3_pt[ 9]; + l_pt[10] -= prp_su3_pt[10]; + l_pt[11] -= prp_su3_pt[11]; + l_pt[12] += GAMMA_Y_SPIN2_VAL*prp_su3_pt[flav_gamma(GAMMA_Y_SPIN2_CO)]; + l_pt[13] += GAMMA_Y_SPIN2_VAL*prp_su3_pt[flav_gamma(GAMMA_Y_SPIN2_CO)+1]; + l_pt[14] += GAMMA_Y_SPIN2_VAL*prp_su3_pt[flav_gamma(GAMMA_Y_SPIN2_CO)+2]; + l_pt[15] += GAMMA_Y_SPIN3_VAL*prp_su3_pt[flav_gamma(GAMMA_Y_SPIN3_CO)]; + l_pt[16] += GAMMA_Y_SPIN3_VAL*prp_su3_pt[flav_gamma(GAMMA_Y_SPIN3_CO)+1]; + l_pt[17] += GAMMA_Y_SPIN3_VAL*prp_su3_pt[flav_gamma(GAMMA_Y_SPIN3_CO)+2]; + l_pt[18] += GAMMA_Y_SPIN2_VAL*prp_su3_pt[flav_gamma(GAMMA_Y_SPIN2_CO)+6]; + l_pt[19] += GAMMA_Y_SPIN2_VAL*prp_su3_pt[flav_gamma(GAMMA_Y_SPIN2_CO)+7]; + l_pt[20] += 
GAMMA_Y_SPIN2_VAL*prp_su3_pt[flav_gamma(GAMMA_Y_SPIN2_CO)+8]; + l_pt[21] += GAMMA_Y_SPIN3_VAL*prp_su3_pt[flav_gamma(GAMMA_Y_SPIN3_CO)+6]; + l_pt[22] += GAMMA_Y_SPIN3_VAL*prp_su3_pt[flav_gamma(GAMMA_Y_SPIN3_CO)+7]; + l_pt[23] += GAMMA_Y_SPIN3_VAL*prp_su3_pt[flav_gamma(GAMMA_Y_SPIN3_CO)+8]; + } + + // -(1 + gamma_Y) + static inline void dpbn_su3_Y_PRECISION( const vector_PRECISION prn_su3_pt, const vector_PRECISION l_pt ) { + l_pt[ 0] -= prn_su3_pt[ 0]; + l_pt[ 1] -= prn_su3_pt[ 1]; + l_pt[ 2] -= prn_su3_pt[ 2]; + l_pt[ 3] -= prn_su3_pt[ 3]; + l_pt[ 4] -= prn_su3_pt[ 4]; + l_pt[ 5] -= prn_su3_pt[ 5]; + l_pt[ 6] -= prn_su3_pt[ 6]; + l_pt[ 7] -= prn_su3_pt[ 7]; + l_pt[ 8] -= prn_su3_pt[ 8]; + l_pt[ 9] -= prn_su3_pt[ 9]; + l_pt[10] -= prn_su3_pt[10]; + l_pt[11] -= prn_su3_pt[11]; + l_pt[12] -= GAMMA_Y_SPIN2_VAL*prn_su3_pt[flav_gamma(GAMMA_Y_SPIN2_CO)]; + l_pt[13] -= GAMMA_Y_SPIN2_VAL*prn_su3_pt[flav_gamma(GAMMA_Y_SPIN2_CO)+1]; + l_pt[14] -= GAMMA_Y_SPIN2_VAL*prn_su3_pt[flav_gamma(GAMMA_Y_SPIN2_CO)+2]; + l_pt[15] -= GAMMA_Y_SPIN3_VAL*prn_su3_pt[flav_gamma(GAMMA_Y_SPIN3_CO)]; + l_pt[16] -= GAMMA_Y_SPIN3_VAL*prn_su3_pt[flav_gamma(GAMMA_Y_SPIN3_CO)+1]; + l_pt[17] -= GAMMA_Y_SPIN3_VAL*prn_su3_pt[flav_gamma(GAMMA_Y_SPIN3_CO)+2]; + l_pt[18] -= GAMMA_Y_SPIN2_VAL*prn_su3_pt[flav_gamma(GAMMA_Y_SPIN2_CO)+6]; + l_pt[19] -= GAMMA_Y_SPIN2_VAL*prn_su3_pt[flav_gamma(GAMMA_Y_SPIN2_CO)+7]; + l_pt[20] -= GAMMA_Y_SPIN2_VAL*prn_su3_pt[flav_gamma(GAMMA_Y_SPIN2_CO)+8]; + l_pt[21] -= GAMMA_Y_SPIN3_VAL*prn_su3_pt[flav_gamma(GAMMA_Y_SPIN3_CO)+6]; + l_pt[22] -= GAMMA_Y_SPIN3_VAL*prn_su3_pt[flav_gamma(GAMMA_Y_SPIN3_CO)+7]; + l_pt[23] -= GAMMA_Y_SPIN3_VAL*prn_su3_pt[flav_gamma(GAMMA_Y_SPIN3_CO)+8]; + } + + + // 1 - gamma_X + static inline void dprp_X_PRECISION( const vector_PRECISION prp_pt, const vector_PRECISION l_pt ) { + prp_pt[ 0] = l_pt[ 0] -GAMMA_X_SPIN0_VAL*l_pt[flav_gamma(GAMMA_X_SPIN0_CO)]; + prp_pt[ 1] = l_pt[ 1] -GAMMA_X_SPIN0_VAL*l_pt[flav_gamma(GAMMA_X_SPIN0_CO)+1]; + prp_pt[ 2] = l_pt[ 2] -GAMMA_X_SPIN0_VAL*l_pt[flav_gamma(GAMMA_X_SPIN0_CO)+2]; + prp_pt[ 3] = l_pt[ 3] -GAMMA_X_SPIN1_VAL*l_pt[flav_gamma(GAMMA_X_SPIN1_CO)]; + prp_pt[ 4] = l_pt[ 4] -GAMMA_X_SPIN1_VAL*l_pt[flav_gamma(GAMMA_X_SPIN1_CO)+1]; + prp_pt[ 5] = l_pt[ 5] -GAMMA_X_SPIN1_VAL*l_pt[flav_gamma(GAMMA_X_SPIN1_CO)+2]; + prp_pt[ 6] = l_pt[ 6] -GAMMA_X_SPIN0_VAL*l_pt[flav_gamma(GAMMA_X_SPIN0_CO)+6]; + prp_pt[ 7] = l_pt[ 7] -GAMMA_X_SPIN0_VAL*l_pt[flav_gamma(GAMMA_X_SPIN0_CO)+7]; + prp_pt[ 8] = l_pt[ 8] -GAMMA_X_SPIN0_VAL*l_pt[flav_gamma(GAMMA_X_SPIN0_CO)+8]; + prp_pt[ 9] = l_pt[ 9] -GAMMA_X_SPIN1_VAL*l_pt[flav_gamma(GAMMA_X_SPIN1_CO)+6]; + prp_pt[10] = l_pt[10] -GAMMA_X_SPIN1_VAL*l_pt[flav_gamma(GAMMA_X_SPIN1_CO)+7]; + prp_pt[11] = l_pt[11] -GAMMA_X_SPIN1_VAL*l_pt[flav_gamma(GAMMA_X_SPIN1_CO)+8]; + } + + // 1 + gamma_X + static inline void dprn_X_PRECISION( const vector_PRECISION prn_pt, const vector_PRECISION l_pt ) { + prn_pt[ 0] = l_pt[ 0] +GAMMA_X_SPIN0_VAL*l_pt[flav_gamma(GAMMA_X_SPIN0_CO)]; + prn_pt[ 1] = l_pt[ 1] +GAMMA_X_SPIN0_VAL*l_pt[flav_gamma(GAMMA_X_SPIN0_CO)+1]; + prn_pt[ 2] = l_pt[ 2] +GAMMA_X_SPIN0_VAL*l_pt[flav_gamma(GAMMA_X_SPIN0_CO)+2]; + prn_pt[ 3] = l_pt[ 3] +GAMMA_X_SPIN1_VAL*l_pt[flav_gamma(GAMMA_X_SPIN1_CO)]; + prn_pt[ 4] = l_pt[ 4] +GAMMA_X_SPIN1_VAL*l_pt[flav_gamma(GAMMA_X_SPIN1_CO)+1]; + prn_pt[ 5] = l_pt[ 5] +GAMMA_X_SPIN1_VAL*l_pt[flav_gamma(GAMMA_X_SPIN1_CO)+2]; + prn_pt[ 6] = l_pt[ 6] +GAMMA_X_SPIN0_VAL*l_pt[flav_gamma(GAMMA_X_SPIN0_CO)+6]; + prn_pt[ 7] = l_pt[ 7] 
+GAMMA_X_SPIN0_VAL*l_pt[flav_gamma(GAMMA_X_SPIN0_CO)+7]; + prn_pt[ 8] = l_pt[ 8] +GAMMA_X_SPIN0_VAL*l_pt[flav_gamma(GAMMA_X_SPIN0_CO)+8]; + prn_pt[ 9] = l_pt[ 9] +GAMMA_X_SPIN1_VAL*l_pt[flav_gamma(GAMMA_X_SPIN1_CO)+6]; + prn_pt[10] = l_pt[10] +GAMMA_X_SPIN1_VAL*l_pt[flav_gamma(GAMMA_X_SPIN1_CO)+7]; + prn_pt[11] = l_pt[11] +GAMMA_X_SPIN1_VAL*l_pt[flav_gamma(GAMMA_X_SPIN1_CO)+8]; + } + + // - (1 - gamma_X) + static inline void dpbp_su3_X_PRECISION( const vector_PRECISION prp_su3_pt, const vector_PRECISION l_pt ) { + l_pt[ 0] -= prp_su3_pt[ 0]; + l_pt[ 1] -= prp_su3_pt[ 1]; + l_pt[ 2] -= prp_su3_pt[ 2]; + l_pt[ 3] -= prp_su3_pt[ 3]; + l_pt[ 4] -= prp_su3_pt[ 4]; + l_pt[ 5] -= prp_su3_pt[ 5]; + l_pt[ 6] -= prp_su3_pt[ 6]; + l_pt[ 7] -= prp_su3_pt[ 7]; + l_pt[ 8] -= prp_su3_pt[ 8]; + l_pt[ 9] -= prp_su3_pt[ 9]; + l_pt[10] -= prp_su3_pt[10]; + l_pt[11] -= prp_su3_pt[11]; + l_pt[12] += GAMMA_X_SPIN2_VAL*prp_su3_pt[flav_gamma(GAMMA_X_SPIN2_CO)]; + l_pt[13] += GAMMA_X_SPIN2_VAL*prp_su3_pt[flav_gamma(GAMMA_X_SPIN2_CO)+1]; + l_pt[14] += GAMMA_X_SPIN2_VAL*prp_su3_pt[flav_gamma(GAMMA_X_SPIN2_CO)+2]; + l_pt[15] += GAMMA_X_SPIN3_VAL*prp_su3_pt[flav_gamma(GAMMA_X_SPIN3_CO)]; + l_pt[16] += GAMMA_X_SPIN3_VAL*prp_su3_pt[flav_gamma(GAMMA_X_SPIN3_CO)+1]; + l_pt[17] += GAMMA_X_SPIN3_VAL*prp_su3_pt[flav_gamma(GAMMA_X_SPIN3_CO)+2]; + l_pt[18] += GAMMA_X_SPIN2_VAL*prp_su3_pt[flav_gamma(GAMMA_X_SPIN2_CO)+6]; + l_pt[19] += GAMMA_X_SPIN2_VAL*prp_su3_pt[flav_gamma(GAMMA_X_SPIN2_CO)+7]; + l_pt[20] += GAMMA_X_SPIN2_VAL*prp_su3_pt[flav_gamma(GAMMA_X_SPIN2_CO)+8]; + l_pt[21] += GAMMA_X_SPIN3_VAL*prp_su3_pt[flav_gamma(GAMMA_X_SPIN3_CO)+6]; + l_pt[22] += GAMMA_X_SPIN3_VAL*prp_su3_pt[flav_gamma(GAMMA_X_SPIN3_CO)+7]; + l_pt[23] += GAMMA_X_SPIN3_VAL*prp_su3_pt[flav_gamma(GAMMA_X_SPIN3_CO)+8]; + } + + // -(1 + gamma_X) + static inline void dpbn_su3_X_PRECISION( const vector_PRECISION prn_su3_pt, const vector_PRECISION l_pt ) { + l_pt[ 0] -= prn_su3_pt[ 0]; + l_pt[ 1] -= prn_su3_pt[ 1]; + l_pt[ 2] -= prn_su3_pt[ 2]; + l_pt[ 3] -= prn_su3_pt[ 3]; + l_pt[ 4] -= prn_su3_pt[ 4]; + l_pt[ 5] -= prn_su3_pt[ 5]; + l_pt[ 6] -= prn_su3_pt[ 6]; + l_pt[ 7] -= prn_su3_pt[ 7]; + l_pt[ 8] -= prn_su3_pt[ 8]; + l_pt[ 9] -= prn_su3_pt[ 9]; + l_pt[10] -= prn_su3_pt[10]; + l_pt[11] -= prn_su3_pt[11]; + l_pt[12] -= GAMMA_X_SPIN2_VAL*prn_su3_pt[flav_gamma(GAMMA_X_SPIN2_CO)]; + l_pt[13] -= GAMMA_X_SPIN2_VAL*prn_su3_pt[flav_gamma(GAMMA_X_SPIN2_CO)+1]; + l_pt[14] -= GAMMA_X_SPIN2_VAL*prn_su3_pt[flav_gamma(GAMMA_X_SPIN2_CO)+2]; + l_pt[15] -= GAMMA_X_SPIN3_VAL*prn_su3_pt[flav_gamma(GAMMA_X_SPIN3_CO)]; + l_pt[16] -= GAMMA_X_SPIN3_VAL*prn_su3_pt[flav_gamma(GAMMA_X_SPIN3_CO)+1]; + l_pt[17] -= GAMMA_X_SPIN3_VAL*prn_su3_pt[flav_gamma(GAMMA_X_SPIN3_CO)+2]; + l_pt[18] -= GAMMA_X_SPIN2_VAL*prn_su3_pt[flav_gamma(GAMMA_X_SPIN2_CO)+6]; + l_pt[19] -= GAMMA_X_SPIN2_VAL*prn_su3_pt[flav_gamma(GAMMA_X_SPIN2_CO)+7]; + l_pt[20] -= GAMMA_X_SPIN2_VAL*prn_su3_pt[flav_gamma(GAMMA_X_SPIN2_CO)+8]; + l_pt[21] -= GAMMA_X_SPIN3_VAL*prn_su3_pt[flav_gamma(GAMMA_X_SPIN3_CO)+6]; + l_pt[22] -= GAMMA_X_SPIN3_VAL*prn_su3_pt[flav_gamma(GAMMA_X_SPIN3_CO)+7]; + l_pt[23] -= GAMMA_X_SPIN3_VAL*prn_su3_pt[flav_gamma(GAMMA_X_SPIN3_CO)+8]; + } + +#endif +//END + static inline void twospin_p_T_PRECISION( const vector_PRECISION out_spin0and1, const vector_PRECISION out_spin2and3, const vector_PRECISION in ) { out_spin0and1[ 0] -= in[ 0]; out_spin0and1[ 1] -= in[ 1]; @@ -637,6 +1122,158 @@ out_spin2and3[11] -= in[11]; } + static inline void doublet_site_clover_PRECISION( const vector_PRECISION eta, 
const vector_PRECISION phi, const config_PRECISION clover ) { + // diagonal + eta[ 0] = clover[ 0]*phi[ 0]; + eta[ 1] = clover[ 1]*phi[ 1]; + eta[ 2] = clover[ 2]*phi[ 2]; + eta[ 3] = clover[ 3]*phi[ 3]; + eta[ 4] = clover[ 4]*phi[ 4]; + eta[ 5] = clover[ 5]*phi[ 5]; + eta[ 6] = clover[ 0]*phi[ 6]; + eta[ 7] = clover[ 1]*phi[ 7]; + eta[ 8] = clover[ 2]*phi[ 8]; + eta[ 9] = clover[ 3]*phi[ 9]; + eta[10] = clover[ 4]*phi[10]; + eta[11] = clover[ 5]*phi[11]; + eta[12] = clover[ 6]*phi[12]; + eta[13] = clover[ 7]*phi[13]; + eta[14] = clover[ 8]*phi[14]; + eta[15] = clover[ 9]*phi[15]; + eta[16] = clover[10]*phi[16]; + eta[17] = clover[11]*phi[17]; + eta[18] = clover[ 6]*phi[18]; + eta[19] = clover[ 7]*phi[19]; + eta[20] = clover[ 8]*phi[20]; + eta[21] = clover[ 9]*phi[21]; + eta[22] = clover[10]*phi[22]; + eta[23] = clover[11]*phi[23]; + // spin 0 and 1 flav 1 + eta[0] += clover[12]*phi[1]; + eta[0] += clover[13]*phi[2]; + eta[0] += clover[14]*phi[3]; + eta[0] += clover[15]*phi[4]; + eta[0] += clover[16]*phi[5]; + eta[1] += clover[17]*phi[2]; + eta[1] += clover[18]*phi[3]; + eta[1] += clover[19]*phi[4]; + eta[1] += clover[20]*phi[5]; + eta[2] += clover[21]*phi[3]; + eta[2] += clover[22]*phi[4]; + eta[2] += clover[23]*phi[5]; + eta[3] += clover[24]*phi[4]; + eta[3] += clover[25]*phi[5]; + eta[4] += clover[26]*phi[5]; + eta[1] += conj_PRECISION(clover[12])*phi[0]; + eta[2] += conj_PRECISION(clover[13])*phi[0]; + eta[3] += conj_PRECISION(clover[14])*phi[0]; + eta[4] += conj_PRECISION(clover[15])*phi[0]; + eta[5] += conj_PRECISION(clover[16])*phi[0]; + eta[2] += conj_PRECISION(clover[17])*phi[1]; + eta[3] += conj_PRECISION(clover[18])*phi[1]; + eta[4] += conj_PRECISION(clover[19])*phi[1]; + eta[5] += conj_PRECISION(clover[20])*phi[1]; + eta[3] += conj_PRECISION(clover[21])*phi[2]; + eta[4] += conj_PRECISION(clover[22])*phi[2]; + eta[5] += conj_PRECISION(clover[23])*phi[2]; + eta[4] += conj_PRECISION(clover[24])*phi[3]; + eta[5] += conj_PRECISION(clover[25])*phi[3]; + eta[5] += conj_PRECISION(clover[26])*phi[4]; + // spin 0 and 1 flav 2 + eta[ 6] += clover[12]*phi[ 7]; + eta[ 6] += clover[13]*phi[ 8]; + eta[ 6] += clover[14]*phi[ 9]; + eta[ 6] += clover[15]*phi[10]; + eta[ 6] += clover[16]*phi[11]; + eta[ 7] += clover[17]*phi[ 8]; + eta[ 7] += clover[18]*phi[ 9]; + eta[ 7] += clover[19]*phi[10]; + eta[ 7] += clover[20]*phi[11]; + eta[ 8] += clover[21]*phi[ 9]; + eta[ 8] += clover[22]*phi[10]; + eta[ 8] += clover[23]*phi[11]; + eta[ 9] += clover[24]*phi[10]; + eta[ 9] += clover[25]*phi[11]; + eta[10] += clover[26]*phi[11]; + eta[ 7] += conj_PRECISION(clover[12])*phi[ 6]; + eta[ 8] += conj_PRECISION(clover[13])*phi[ 6]; + eta[ 9] += conj_PRECISION(clover[14])*phi[ 6]; + eta[10] += conj_PRECISION(clover[15])*phi[ 6]; + eta[11] += conj_PRECISION(clover[16])*phi[ 6]; + eta[ 8] += conj_PRECISION(clover[17])*phi[ 7]; + eta[ 9] += conj_PRECISION(clover[18])*phi[ 7]; + eta[10] += conj_PRECISION(clover[19])*phi[ 7]; + eta[11] += conj_PRECISION(clover[20])*phi[ 7]; + eta[ 9] += conj_PRECISION(clover[21])*phi[ 8]; + eta[10] += conj_PRECISION(clover[22])*phi[ 8]; + eta[11] += conj_PRECISION(clover[23])*phi[ 8]; + eta[10] += conj_PRECISION(clover[24])*phi[ 9]; + eta[11] += conj_PRECISION(clover[25])*phi[ 9]; + eta[11] += conj_PRECISION(clover[26])*phi[10]; + // spin 2 and 3 flav 1 + eta[12] += clover[28]*phi[14]; + eta[12] += clover[27]*phi[13]; + eta[12] += clover[29]*phi[15]; + eta[12] += clover[30]*phi[16]; + eta[12] += clover[31]*phi[17]; + eta[13] += clover[32]*phi[14]; + eta[13] += clover[33]*phi[15]; + 
eta[13] += clover[34]*phi[16]; + eta[13] += clover[35]*phi[17]; + eta[14] += clover[36]*phi[15]; + eta[14] += clover[37]*phi[16]; + eta[14] += clover[38]*phi[17]; + eta[15] += clover[39]*phi[16]; + eta[15] += clover[40]*phi[17]; + eta[16] += clover[41]*phi[17]; + eta[13] += conj_PRECISION(clover[27])*phi[12]; + eta[14] += conj_PRECISION(clover[28])*phi[12]; + eta[15] += conj_PRECISION(clover[29])*phi[12]; + eta[16] += conj_PRECISION(clover[30])*phi[12]; + eta[17] += conj_PRECISION(clover[31])*phi[12]; + eta[14] += conj_PRECISION(clover[32])*phi[13]; + eta[15] += conj_PRECISION(clover[33])*phi[13]; + eta[16] += conj_PRECISION(clover[34])*phi[13]; + eta[17] += conj_PRECISION(clover[35])*phi[13]; + eta[15] += conj_PRECISION(clover[36])*phi[14]; + eta[16] += conj_PRECISION(clover[37])*phi[14]; + eta[17] += conj_PRECISION(clover[38])*phi[14]; + eta[16] += conj_PRECISION(clover[39])*phi[15]; + eta[17] += conj_PRECISION(clover[40])*phi[15]; + eta[17] += conj_PRECISION(clover[41])*phi[16]; + // spin 2 and 3 flav 2 + eta[18] += clover[28]*phi[20]; + eta[18] += clover[27]*phi[19]; + eta[18] += clover[29]*phi[21]; + eta[18] += clover[30]*phi[22]; + eta[18] += clover[31]*phi[23]; + eta[19] += clover[32]*phi[20]; + eta[19] += clover[33]*phi[21]; + eta[19] += clover[34]*phi[22]; + eta[19] += clover[35]*phi[23]; + eta[20] += clover[36]*phi[21]; + eta[20] += clover[37]*phi[22]; + eta[20] += clover[38]*phi[23]; + eta[21] += clover[39]*phi[22]; + eta[21] += clover[40]*phi[23]; + eta[22] += clover[41]*phi[23]; + eta[19] += conj_PRECISION(clover[27])*phi[18]; + eta[20] += conj_PRECISION(clover[28])*phi[18]; + eta[21] += conj_PRECISION(clover[29])*phi[18]; + eta[22] += conj_PRECISION(clover[30])*phi[18]; + eta[23] += conj_PRECISION(clover[31])*phi[18]; + eta[20] += conj_PRECISION(clover[32])*phi[19]; + eta[21] += conj_PRECISION(clover[33])*phi[19]; + eta[22] += conj_PRECISION(clover[34])*phi[19]; + eta[23] += conj_PRECISION(clover[35])*phi[19]; + eta[21] += conj_PRECISION(clover[36])*phi[20]; + eta[22] += conj_PRECISION(clover[37])*phi[20]; + eta[23] += conj_PRECISION(clover[38])*phi[20]; + eta[22] += conj_PRECISION(clover[39])*phi[21]; + eta[23] += conj_PRECISION(clover[40])*phi[21]; + eta[23] += conj_PRECISION(clover[41])*phi[22]; + } + static inline void spin0and1_site_clover_PRECISION( const vector_PRECISION eta, const vector_PRECISION phi, const config_PRECISION clover ) { // diagonal eta[ 0] = clover[ 0]*phi[ 0]; @@ -808,5 +1445,37 @@ eta[11] += conj_PRECISION(clover[40])*phi[ 9]; eta[11] += conj_PRECISION(clover[41])*phi[10]; } - + + static inline void site_clover_vectorized_PRECISION( PRECISION *eta, PRECISION *phi, PRECISION *clover ) { + mm_PRECISION in_re[3][6]; + mm_PRECISION in_im[3][6]; + + mm_PRECISION clov_re; + mm_PRECISION clov_im; + + mm_PRECISION out_re; + mm_PRECISION out_im; + + for ( int i=0; i<6; i++ ) { + mm_loadi_6times_PRECISION( phi+2*i+0, &(in_re[0][i]), &(in_re[1][i]), &(in_re[2][i]), 0, 12 ); + mm_loadi_6times_PRECISION( phi+2*i+1, &(in_im[0][i]), &(in_im[1][i]), &(in_im[2][i]), 0, 12 ); + } + + for ( int n=0; n<3; n++ ) { + clov_re = mm_load_PRECISION( clover ); + clov_im = mm_load_PRECISION( clover+SIMD_LENGTH_PRECISION ); + cmul_PRECISION( clov_re, clov_im, in_re[n][0], in_im[n][0], &out_re, &out_im ); + clover+=2*SIMD_LENGTH_PRECISION; + + for ( int i=1; i<6; i++ ) { + clov_re = mm_load_PRECISION( clover ); + clov_im = mm_load_PRECISION( clover+SIMD_LENGTH_PRECISION ); + cfmadd_PRECISION( clov_re, clov_im, in_re[n][i], in_im[n][i], &out_re, &out_im ); + 
clover+=2*SIMD_LENGTH_PRECISION; + } + + cstore_PRECISION( eta, out_re, out_im ); + eta+=2*SIMD_LENGTH_PRECISION; + } + } #endif diff --git a/src/gathering_generic.c b/src/gathering_generic.c index e5e4a23..47996d5 100644 --- a/src/gathering_generic.c +++ b/src/gathering_generic.c @@ -46,10 +46,14 @@ void gathering_PRECISION_setup( gathering_PRECISION_struct *gs, level_struct *l // define data merging // define data gathering permutation int i, mu, current_rank, offset, offset_sum, - process_coords[4] = {0,0,0,0}, parent_coords[4] = {0,0,0,0}, *process_list = NULL; + process_coords[4] = {0,0,0,0}, parent_coords[4] = {0,0,0,0}, *process_list = NULL; MALLOC( process_list, int, l->num_processes ); +#ifdef HAVE_TM1p1 + MALLOC( gs->transfer_buffer, complex_PRECISION, 2 * gs->dist_inner_lattice_sites * l->num_lattice_site_var ); +#else MALLOC( gs->transfer_buffer, complex_PRECISION, gs->dist_inner_lattice_sites * l->num_lattice_site_var ); - +#endif + l->idle = 0; i = 0; for ( process_coords[T]=0; process_coords[T]parent_rank) ); + g.Cart_rank( g.comm_cart, parent_coords, &(l->parent_rank) ); // find out if current process is supposed to idle if ( offset_sum > 0 ) l->idle = 1; @@ -91,7 +95,11 @@ void gathering_PRECISION_setup( gathering_PRECISION_struct *gs, level_struct *l MALLOC( gs->gather_list, int, gs->gather_list_length ); MALLOC( gs->permutation, int, l->num_inner_lattice_sites ); MALLOC( gs->reqs, MPI_Request, gs->gather_list_length ); +#ifdef HAVE_TM1p1 + MALLOC( gs->buffer, complex_PRECISION, 2*l->inner_vector_size ); +#else MALLOC( gs->buffer, complex_PRECISION, l->inner_vector_size ); +#endif MALLOC( field1, int, l->num_inner_lattice_sites ); MALLOC( field2, int, l->num_inner_lattice_sites ); @@ -112,7 +120,7 @@ void gathering_PRECISION_setup( gathering_PRECISION_struct *gs, level_struct *l for ( mu=0; mu<4; mu++ ) process_coords[mu] = g.my_coords[mu] + *(count[mu]) * (l->comm_offset[mu]/merge[mu]); - g.Cart_rank( g.comm_cart, process_coords, gs->gather_list + j ); + g.Cart_rank( g.comm_cart, process_coords, gs->gather_list + j ); j++; @@ -204,32 +212,56 @@ void gathering_PRECISION_free( gathering_PRECISION_struct *gs, level_struct *l ) FREE( gs->gather_list, int, gs->gather_list_length ); FREE( gs->permutation, int, l->num_inner_lattice_sites ); FREE( gs->reqs, MPI_Request, gs->gather_list_length ); +#ifdef HAVE_TM1p1 + FREE( gs->buffer, complex_PRECISION, 2*l->inner_vector_size ); +#else FREE( gs->buffer, complex_PRECISION, l->inner_vector_size ); +#endif MPI_Comm_free( &(gs->level_comm) ); MPI_Group_free( &(gs->level_comm_group) ); } +#ifdef HAVE_TM1p1 + FREE( gs->transfer_buffer, complex_PRECISION, 2 * gs->dist_inner_lattice_sites * l->num_lattice_site_var ); +#else FREE( gs->transfer_buffer, complex_PRECISION, gs->dist_inner_lattice_sites * l->num_lattice_site_var ); +#endif } void conf_PRECISION_gather( operator_PRECISION_struct *out, operator_PRECISION_struct *in, level_struct *l ) { int send_size_hopp = l->gs_PRECISION.dist_inner_lattice_sites * 4 * SQUARE( l->num_lattice_site_var ), - send_size_clov = l->gs_PRECISION.dist_inner_lattice_sites * ( (l->num_lattice_site_var*(l->num_lattice_site_var+1))/2 ); + send_size_clov = l->gs_PRECISION.dist_inner_lattice_sites * ( (l->num_lattice_site_var*(l->num_lattice_site_var+1))/2 ), + send_size_block = l->gs_PRECISION.dist_inner_lattice_sites * ( (l->num_lattice_site_var/2*(l->num_lattice_site_var/2+1)) ); #ifdef HAVE_TM - int send_size_block = l->gs_PRECISION.dist_inner_lattice_sites * ( 
(l->num_lattice_site_var/2*(l->num_lattice_site_var/2+1)) ); + out->mu = in->mu; + out->mu_even_shift = in->mu_even_shift; + out->mu_odd_shift = in->mu_odd_shift; #endif - + out->m0 = in->m0; +#ifdef HAVE_TM1p1 + out->epsbar = in->epsbar; + out->epsbar_ig5_even_shift = in->epsbar_ig5_even_shift; + out->epsbar_ig5_odd_shift = in->epsbar_ig5_odd_shift; +#endif + if ( g.my_rank != l->parent_rank ) { - MPI_Request req; + MPI_Request req, odd_req; +#ifdef HAVE_TM1p1 + MPI_Request eps_req; + MPI_Isend( in->epsbar_term, send_size_block, MPI_COMPLEX_PRECISION, l->parent_rank, 4, g.comm_cart, &eps_req ); +#endif #ifdef HAVE_TM - MPI_Request tm_req, odd_req; - MPI_Isend( in->tm_term, send_size_block, MPI_COMPLEX_PRECISION, l->parent_rank, 2, g.comm_cart, &tm_req ); - MPI_Isend( in->odd_proj, send_size_block, MPI_COMPLEX_PRECISION, l->parent_rank, 3, g.comm_cart, &odd_req ); + MPI_Request tm_req; + MPI_Isend( in->tm_term, send_size_block, MPI_COMPLEX_PRECISION, l->parent_rank, 3, g.comm_cart, &tm_req ); #endif + MPI_Isend( in->odd_proj, send_size_block, MPI_COMPLEX_PRECISION, l->parent_rank, 2, g.comm_cart, &odd_req ); MPI_Isend( in->D, send_size_hopp, MPI_COMPLEX_PRECISION, l->parent_rank, 0, g.comm_cart, &req ); MPI_Send( in->clover, send_size_clov, MPI_COMPLEX_PRECISION, l->parent_rank, 1, g.comm_cart ); +#ifdef HAVE_TM1p1 + MPI_Wait( &eps_req, MPI_STATUS_IGNORE ); +#endif #ifdef HAVE_TM MPI_Wait( &tm_req, MPI_STATUS_IGNORE ); MPI_Wait( &odd_req, MPI_STATUS_IGNORE ); @@ -238,43 +270,54 @@ void conf_PRECISION_gather( operator_PRECISION_struct *out, operator_PRECISION_s } else { int i, j, n=l->gs_PRECISION.gather_list_length, s=l->num_inner_lattice_sites, t, *pi = l->gs_PRECISION.permutation; - vector_PRECISION buffer_hopp = NULL, buffer_clov = NULL; - MPI_Request *hopp_reqs = NULL, *clov_reqs = NULL; + vector_PRECISION buffer_hopp = NULL, buffer_clov = NULL, buffer_odd_proj = NULL; + MPI_Request *hopp_reqs = NULL, *clov_reqs = NULL, *odd_proj_reqs = NULL; +#ifdef HAVE_TM1p1 + vector_PRECISION buffer_eps_term = NULL; + MPI_Request *eps_term_reqs = NULL; + MALLOC( buffer_eps_term, complex_PRECISION, n*send_size_block ); + MALLOC( eps_term_reqs, MPI_Request, n ); +#endif #ifdef HAVE_TM - vector_PRECISION buffer_tm_term = NULL, buffer_odd_proj = NULL; - MPI_Request *tm_term_reqs = NULL, *odd_proj_reqs = NULL; + vector_PRECISION buffer_tm_term = NULL; + MPI_Request *tm_term_reqs = NULL; MALLOC( buffer_tm_term, complex_PRECISION, n*send_size_block ); - MALLOC( buffer_odd_proj, complex_PRECISION, n*send_size_block ); MALLOC( tm_term_reqs, MPI_Request, n ); - MALLOC( odd_proj_reqs, MPI_Request, n ); #endif MALLOC( buffer_hopp, complex_PRECISION, n*send_size_hopp ); MALLOC( buffer_clov, complex_PRECISION, n*send_size_clov ); + MALLOC( buffer_odd_proj, complex_PRECISION, n*send_size_block ); MALLOC( hopp_reqs, MPI_Request, n ); MALLOC( clov_reqs, MPI_Request, n ); + MALLOC( odd_proj_reqs, MPI_Request, n ); PROF_PRECISION_START( _GD_COMM ); for ( i=1; igs_PRECISION.gather_list[i], 4, g.comm_cart, &(eps_term_reqs[i]) ); +#endif #ifdef HAVE_TM MPI_Irecv( buffer_tm_term+i*send_size_block, send_size_block, MPI_COMPLEX_PRECISION, - l->gs_PRECISION.gather_list[i], 2, g.comm_cart, &(tm_term_reqs[i]) ); - MPI_Irecv( buffer_odd_proj+i*send_size_block, send_size_block, MPI_COMPLEX_PRECISION, - l->gs_PRECISION.gather_list[i], 3, g.comm_cart, &(odd_proj_reqs[i]) ); + l->gs_PRECISION.gather_list[i], 3, g.comm_cart, &(tm_term_reqs[i]) ); #endif MPI_Irecv( buffer_hopp+i*send_size_hopp, send_size_hopp, MPI_COMPLEX_PRECISION, 
l->gs_PRECISION.gather_list[i], 0, g.comm_cart, &(hopp_reqs[i]) ); MPI_Irecv( buffer_clov+i*send_size_clov, send_size_clov, MPI_COMPLEX_PRECISION, l->gs_PRECISION.gather_list[i], 1, g.comm_cart, &(clov_reqs[i]) ); + MPI_Irecv( buffer_odd_proj+i*send_size_block, send_size_block, MPI_COMPLEX_PRECISION, + l->gs_PRECISION.gather_list[i], 2, g.comm_cart, &(odd_proj_reqs[i]) ); } PROF_PRECISION_STOP( _GD_COMM, 2*n-2 ); - -#ifdef HAVE_TM + +#ifdef HAVE_TM1p1 for ( i=0; itm_term[i]; - + buffer_eps_term[i] = in->epsbar_term[i]; +#endif +#ifdef HAVE_TM for ( i=0; iodd_proj[i]; + buffer_tm_term[i] = in->tm_term[i]; #endif for ( i=0; iclover[i]; -#ifdef HAVE_TM + for ( i=0; iodd_proj[i]; + +#ifdef HAVE_TM1p1 PROF_PRECISION_START( _GD_IDLE ); for ( i=1; itm_term[ t*pi[i] + j ] = buffer_tm_term[ t*i + j ]; - + out->epsbar_term[ t*pi[i] + j ] = buffer_eps_term[ t*i + j ]; +#endif +#ifdef HAVE_TM PROF_PRECISION_START( _GD_IDLE ); for ( i=1; iodd_proj[ t*pi[i] + j ] = buffer_odd_proj[ t*i + j ]; + out->tm_term[ t*pi[i] + j ] = buffer_tm_term[ t*i + j ]; #endif PROF_PRECISION_START( _GD_IDLE ); @@ -324,20 +371,34 @@ void conf_PRECISION_gather( operator_PRECISION_struct *out, operator_PRECISION_s for ( i=0; iclover[ t*pi[i] + j ] = buffer_clov[ t*i + j ]; + + PROF_PRECISION_START( _GD_IDLE ); + for ( i=1; iodd_proj[ t*pi[i] + j ] = buffer_odd_proj[ t*i + j ]; FREE( buffer_hopp, complex_PRECISION, n*send_size_hopp ); FREE( buffer_clov, complex_PRECISION, n*send_size_clov ); + FREE( buffer_odd_proj, complex_PRECISION, n*send_size_block ); FREE( hopp_reqs, MPI_Request, n ); FREE( clov_reqs, MPI_Request, n ); + FREE( odd_proj_reqs, MPI_Request, n ); #ifdef HAVE_TM FREE( buffer_tm_term, complex_PRECISION, n*send_size_block ); - FREE( buffer_odd_proj, complex_PRECISION, n*send_size_block ); FREE( tm_term_reqs, MPI_Request, n ); - FREE( odd_proj_reqs, MPI_Request, n ); #endif +#ifdef HAVE_TM1p1 + FREE( buffer_eps_term, complex_PRECISION, n*send_size_block ); + FREE( eps_term_reqs, MPI_Request, n ); +#endif + } l->dummy_p_PRECISION.op = out; - l->dummy_p_PRECISION.shift = 0; l->dummy_p_PRECISION.v_start = 0; l->dummy_p_PRECISION.v_end = l->inner_vector_size; diff --git a/src/ghost_generic.c b/src/ghost_generic.c index 8db1cfc..5a423a0 100644 --- a/src/ghost_generic.c +++ b/src/ghost_generic.c @@ -26,7 +26,7 @@ void negative_sendrecv_PRECISION( vector_PRECISION phi, const int mu, comm_PRECI if( l->global_splitting[mu] > 1 ) { int i, j, num_boundary_sites = c->num_boundary_sites[2*mu+1], boundary_start, - *boundary_table = c->boundary_table[2*mu+1], n = l->num_lattice_site_var; + *boundary_table = c->boundary_table[2*mu+1], n = l->num_lattice_site_var; vector_PRECISION buffer, tmp_pt, buffer_pt; @@ -102,6 +102,10 @@ void ghost_alloc_PRECISION( int buffer_size, comm_PRECISION_struct *c, level_str if ( g.method < 5 ) factor = 2; } + +#ifdef HAVE_TM1p1 + factor *= 2; +#endif if ( buffer_size <= 0 ) { c->comm_start[0] = c->offset*l->num_inner_lattice_sites; @@ -128,13 +132,22 @@ void ghost_alloc_PRECISION( int buffer_size, comm_PRECISION_struct *c, level_str } else { for ( mu=0; mu<4; mu++ ) { c->max_length[mu] = buffer_size; +#ifdef HAVE_TM1p1 + MALLOC( c->buffer[2*mu], complex_PRECISION, 2*buffer_size ); + MALLOC( c->buffer[2*mu+1], complex_PRECISION, 2*buffer_size ); +#else MALLOC( c->buffer[2*mu], complex_PRECISION, buffer_size ); MALLOC( c->buffer[2*mu+1], complex_PRECISION, buffer_size ); +#endif } } if ( l->vbuf_PRECISION[8] == NULL ) { +#ifdef HAVE_TM1p1 + MALLOC( l->vbuf_PRECISION[8], complex_PRECISION, 
2*l->vector_size ); +#else MALLOC( l->vbuf_PRECISION[8], complex_PRECISION, l->vector_size ); +#endif } } @@ -149,7 +162,11 @@ void ghost_free_PRECISION( comm_PRECISION_struct *c, level_struct *l ) { } if ( l->vbuf_PRECISION[8] != NULL ) { +#ifdef HAVE_TM1p1 + FREE( l->vbuf_PRECISION[8], complex_PRECISION, 2*l->vector_size ); +#else FREE( l->vbuf_PRECISION[8], complex_PRECISION, l->vector_size ); +#endif } } @@ -194,6 +211,15 @@ void ghost_sendrecv_PRECISION( vector_PRECISION phi, const int mu, const int dir table_start = c->num_even_boundary_sites[mu_dir]; } +#ifdef HAVE_TM1p1 + if ( g.n_flavours == 2 ) { + length[0] *= 2; + length[1] *= 2; + comm_start *= 2; + offset *= 2; + } +#endif + ASSERT( c->in_use[mu_dir] == 0 ); c->in_use[mu_dir] = 1; @@ -270,6 +296,11 @@ void ghost_wait_PRECISION( vector_PRECISION phi, const int mu, const int dir, int mu_dir = 2*mu-MIN(dir,0); int i, j, *table, offset = c->offset, length[2]={0,0}, table_start = 0; vector_PRECISION buffer, phi_pt; + +#ifdef HAVE_TM1p1 + if ( g.n_flavours == 2 ) + offset *= 2; +#endif if ( amount == _FULL_SYSTEM ) { length[0] = (c->num_boundary_sites[2*mu])*offset; @@ -284,7 +315,7 @@ void ghost_wait_PRECISION( vector_PRECISION phi, const int mu, const int dir, length[1] = c->num_odd_boundary_sites[2*mu+1]*offset; table_start = c->num_even_boundary_sites[mu_dir]; } - + ASSERT( c->in_use[mu_dir] == 1 ); if ( dir == 1 ) { diff --git a/src/gram_schmidt_generic.c b/src/gram_schmidt_generic.c new file mode 100644 index 0000000..71eb26c --- /dev/null +++ b/src/gram_schmidt_generic.c @@ -0,0 +1,150 @@ +/* + * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder. + * + * This file is part of the DDalphaAMG solver library. + * + * The DDalphaAMG solver library is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * The DDalphaAMG solver library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * + * You should have received a copy of the GNU General Public License + * along with the DDalphaAMG solver library. If not, see http://www.gnu.org/licenses/. 
+ * + */ + +#include "main.h" + +#ifndef OPTIMIZED_INTERPOLATION_SETUP_PRECISION +void gram_schmidt_on_aggregates_PRECISION( vector_PRECISION *V, const int num_vect, level_struct *l, struct Thread *threading ) +#else +void gram_schmidt_on_aggregates_PRECISION( complex_PRECISION *V, const int num_vec, level_struct *l, struct Thread *threading ) +#endif +{ + + PROF_PRECISION_START( _GRAM_SCHMIDT_ON_AGGREGATES, threading ); + +#ifndef OPTIMIZED_INTERPOLATION_SETUP_PRECISION + SYNC_CORES(threading) + SYNC_HYPERTHREADS(threading) + int i, j, k, k1, k2, num_aggregates = l->s_PRECISION.num_aggregates, + aggregate_size = l->inner_vector_size / num_aggregates, offset = l->num_lattice_site_var/2; + + complex_PRECISION alpha1, alpha2; + vector_PRECISION v_pt1, v_pt2; + PRECISION norm1, norm2; + + for ( j=threading->n_thread*threading->core+threading->thread; jn_thread*threading->n_core ) { + for ( k1=0; k1 V[k2] | 2*j-th and 2*j+1-st aggregate + for ( i=0; idepth == 0) + block_gram_schmidt_PRECISION( V, num_vec, l, threading ); + else + aggregate_gram_schmidt_PRECISION( V, num_vec, l, threading ); +#endif + + PROF_PRECISION_STOP( _GRAM_SCHMIDT_ON_AGGREGATES, 1, threading ); +} + + +void gram_schmidt_PRECISION( vector_PRECISION *V, complex_PRECISION *buffer, const int begin, const int n, level_struct *l, struct Thread *threading ) { + + // NOTE: only thread safe, if "buffer" is the same buffer for all threads belonging to a common MPI process + START_MASTER(threading) + PROF_PRECISION_START( _LA ); + END_MASTER(threading) + SYNC_CORES(threading) + + PRECISION beta; + int i, j, start, end; + + compute_core_start_end_custom( 0, l->inner_vector_size, &start, &end, l, threading, l->num_lattice_site_var ); + + for ( i=begin; iinner_vector_size, l, threading ); + SYNC_CORES(threading) + START_MASTER(threading) + for ( j=0; j0 ) { + START_MASTER(threading) + PROF_PRECISION_START( _ALLR ); + MPI_Allreduce( buffer, buffer+n, i, MPI_COMPLEX_PRECISION, MPI_SUM, (l->depth==0)?g.comm_cart:l->gs_PRECISION.level_comm ); + PROF_PRECISION_STOP( _ALLR, 1 ); + END_MASTER(threading) + SYNC_MASTER_TO_ALL(threading) + } + + for( j=0; jinner_vector_size, l, threading ); + SYNC_CORES(threading) + } + + SYNC_CORES(threading) + + beta = global_norm_PRECISION( V[i], 0, l->inner_vector_size, l, threading ); + SYNC_MASTER_TO_ALL(threading) + vector_PRECISION_real_scale( V[i], V[i], creal(1.0/beta), start, end, l ); + SYNC_CORES(threading) + } + + START_MASTER(threading) + PROF_PRECISION_STOP( _LA, 1 ); + END_MASTER(threading) + SYNC_CORES(threading) +} diff --git a/src/gram_schmidt_generic.h b/src/gram_schmidt_generic.h new file mode 100644 index 0000000..1500157 --- /dev/null +++ b/src/gram_schmidt_generic.h @@ -0,0 +1,521 @@ +/* + * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder. + * + * This file is part of the DDalphaAMG solver library. + * + * The DDalphaAMG solver library is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * The DDalphaAMG solver library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * + * You should have received a copy of the GNU General Public License + * along with the DDalphaAMG solver library. If not, see http://www.gnu.org/licenses/. + * + */ + +#ifndef GRAM_SCHMIDT_PRECISION_HEADER +#define GRAM_SCHMIDT_PRECISION_HEADER + +// Gram-Schmidt on full vectors +void gram_schmidt_PRECISION( vector_PRECISION *V, complex_PRECISION *buffer, const int start, const int n, + level_struct *l, struct Thread *threading ); +// Gram-Schmidt on aggregates +#ifndef OPTIMIZED_INTERPOLATION_SETUP_PRECISION +void gram_schmidt_on_aggregates_PRECISION( vector_PRECISION *V, const int num_vec, + level_struct *l, struct Thread *threading ); +#else // optimized version on the operator layout +void gram_schmidt_on_aggregates_PRECISION( complex_PRECISION *operator, const int num_vec, + level_struct *l, struct Thread *threading ); +#endif + +#ifdef OPTIMIZED_INTERPOLATION_SETUP_PRECISION + +// SIMD version of gram_schmidt_on_aggregates optimized on a operator layout +// block_gram_schmidt_PRECISION follows, after definition of others inline void functions marked by "used by *IT*" +static inline void block_gram_schmidt_PRECISION( complex_PRECISION *V, int num_vec, level_struct *l, + struct Thread *threading ); + +static inline void aggregate_gram_schmidt_PRECISION( complex_PRECISION *V, const int num_vec, + level_struct *l, struct Thread *threading ) { + + SYNC_CORES(threading) + SYNC_HYPERTHREADS(threading) + int i, j, k, k1, k2, k3, num_aggregates = l->s_PRECISION.num_aggregates, + aggregate_size = l->inner_vector_size / num_aggregates, offset = l->num_lattice_site_var/2; + + PRECISION *v_pt1; + PRECISION *v_pt2; + PRECISION norm1, norm2; + PRECISION next_norm1; + PRECISION next_norm2; + int ldv = SIMD_LENGTH_PRECISION; + int V_block_offset = 2*l->vector_size; + + for ( j=threading->n_thread*threading->core+threading->thread; jn_thread*threading->n_core ) { + + v_pt1 = (PRECISION *)V + 0 + j*aggregate_size*2*ldv; + + next_norm1 = 0.0; + next_norm2 = 0.0; + for ( i=0; is_PRECISION.num_aggregates, + aggregate_size = l->inner_vector_size / num_aggregates, offset = l->num_lattice_site_var/2; + + PRECISION *v_pt1; + PRECISION *v_pt2; + PRECISION norm; + PRECISION next_norm; + int ldv = leading_dimension; + //offset = 6; + + + // current thread chooses an aggregate + for ( int jp=threading->core; jp<2*num_aggregates; jp+=threading->n_core ) { + j = jp/2; + int component = jp%2; + + + v_pt1 = V + 2*component*offset*ldv + j*aggregate_size*2*ldv; + + next_norm = 0.0; + + // for the whole aggregate + for ( i=0; is_PRECISION.num_aggregates; + int aggregate_size = l->inner_vector_size / num_aggregates; + int offset = l->num_lattice_site_var/2; + + for ( int jp=threading->core; jpn_core ) { + int j = jp/2; + int component = jp%2; + // factors 2 are for complex and spin01/23 aggregates + Up = U + 2*component*offset*leading_dimension + 2*2*j*aggregate_size*leading_dimension; + Bp = B + 2*component*offset*leading_dimension + 2*2*j*aggregate_size*leading_dimension; + mm_PRECISION U_re; + mm_PRECISION U_im; + mm_PRECISION B_re; + mm_PRECISION B_im; + mm_PRECISION S_re[SIMD_LENGTH_PRECISION]; + mm_PRECISION S_im[SIMD_LENGTH_PRECISION]; + for( int i=0; is_PRECISION.num_aggregates; + int aggregate_size = l->inner_vector_size / num_aggregates; + int offset = l->num_lattice_site_var/2; + + for ( int jp=threading->core; jpn_core ) { + int j = jp/2; + int component = jp%2; + // factors 2 are for complex and spin01/23 aggregates + Up = U + 2*component*offset*leading_dimension + 
2*2*j*aggregate_size*leading_dimension; + Bp = B + 2*component*offset*leading_dimension + 2*2*j*aggregate_size*leading_dimension; + mm_PRECISION U_re; + mm_PRECISION U_im; + mm_PRECISION B_re; + mm_PRECISION B_im; + mm_PRECISION S_re[SIMD_LENGTH_PRECISION]; + mm_PRECISION S_im[SIMD_LENGTH_PRECISION]; + for( int i=0; is_PRECISION.num_aggregates*SIMD_LENGTH_PRECISION*SIMD_LENGTH_PRECISION, 64); + ((PRECISION **)threading->workspace)[0] = S; + END_LOCKED_MASTER(threading) + S = ((PRECISION **)threading->workspace)[0]; + + aggregate_block_dot_block_PRECISION(S, U, B, num_vec, SIMD_LENGTH_PRECISION, l , threading); + aggregate_block_minus_block_times_dot_PRECISION(B, U, S, num_vec, SIMD_LENGTH_PRECISION, l , threading); + + START_LOCKED_MASTER(threading) + FREE_HUGEPAGES(S, PRECISION, 2*2*l->s_PRECISION.num_aggregates*SIMD_LENGTH_PRECISION*SIMD_LENGTH_PRECISION); + END_LOCKED_MASTER(threading) + + END_NO_HYPERTHREADS(threading) +} + +static inline void block_gram_schmidt_PRECISION( complex_PRECISION *V, int num_vec, level_struct *l, + struct Thread *threading ) { + SYNC_CORES(threading); + for ( int i=0; ivector_size), + (PRECISION *)(V + j*l->vector_size), vecs, + l, threading ); + aggregate_gram_schmidt_block_PRECISION( (PRECISION *)(V + i*l->vector_size), vecs, SIMD_LENGTH_PRECISION, l, threading ); + } + SYNC_CORES(threading); +} + +#endif //OPTIMIZED_INTERPOLATION_SETUP_PRECISION +#endif diff --git a/src/init.c b/src/init.c index e40d771..6295390 100644 --- a/src/init.c +++ b/src/init.c @@ -44,19 +44,13 @@ void next_level_setup( vector_double *V, level_struct *l, struct Thread *threadi // define next level parameters l->next_level->level = l->level-1; l->next_level->depth = l->depth+1; - l->next_level->real_shift = l->real_shift; - l->next_level->dirac_shift = l->dirac_shift; -#ifdef HAVE_TM - l->next_level->tm_shift = g.tm_mu*g.tm_mu_factor[l->next_level->depth]; - l->next_level->tm_even_shift = g.tm_mu_even_shift*g.tm_mu_factor[l->next_level->depth]; - l->next_level->tm_odd_shift = g.tm_mu_odd_shift*g.tm_mu_factor[l->next_level->depth]; -#endif l->next_level->tol = l->tol; l->next_level->post_smooth_iter = g.post_smooth_iter[l->depth+1]; l->next_level->relax_fac = g.relax_fac[l->depth+1]; l->next_level->block_iter = g.block_iter[l->depth+1]; l->next_level->setup_iter = g.setup_iter[l->depth+1]; l->next_level->num_eig_vect = l->level==1?l->num_eig_vect:g.num_eig_vect[l->depth+1]; + l->next_level->num_parent_eig_vect = l->num_eig_vect; l->next_level->num_lattice_site_var = 2 * l->num_eig_vect; l->next_level->n_cy = g.ncycle[l->depth+1]; l->next_level->global_lattice = g.global_lattice[l->depth+1]; @@ -118,7 +112,9 @@ void next_level_setup( vector_double *V, level_struct *l, struct Thread *threadi } } + START_LOCKED_MASTER(threading) if ( l->depth == 0 ) printf0("\ninitial coarse grid correction is defined\n"); + END_LOCKED_MASTER(threading) } @@ -142,8 +138,8 @@ void method_setup( vector_double *V, level_struct *l, struct Thread *threading ) double t0=0, t1=0; START_LOCKED_MASTER(threading) + g.in_setup = 1; if ( g.vt.evaluation ) { - l->dirac_shift = l->real_shift; l->level = g.num_levels-1; } @@ -160,9 +156,14 @@ void method_setup( vector_double *V, level_struct *l, struct Thread *threading ) g.tol, _RIGHT, vcycle_float, &(g.p_MP), l ); g.p.op = &(g.op_double); #if defined(INIT_ONE_PREC) && (defined (DEBUG) || defined (TEST_VECTOR_ANALYSIS)) +#ifdef HAVE_TM1p1 + MALLOC( g.p.b, complex_double, 2*l->inner_vector_size ); + MALLOC( g.p.x, complex_double, 2*l->inner_vector_size ); +#else 
MALLOC( g.p.b, complex_double, l->inner_vector_size ); MALLOC( g.p.x, complex_double, l->inner_vector_size ); #endif +#endif #ifdef INIT_ONE_PREC } else { #endif @@ -181,9 +182,14 @@ void method_setup( vector_double *V, level_struct *l, struct Thread *threading ) g.tol, _NOTHING, NULL, &(g.p_MP), l ); g.p.op = &(g.op_double); #if defined(INIT_ONE_PREC) && (defined (DEBUG) || defined (TEST_VECTOR_ANALYSIS)) +#ifdef HAVE_TM1p1 + MALLOC( g.p.b, complex_double, 2*l->inner_vector_size ); + MALLOC( g.p.x, complex_double, 2*l->inner_vector_size ); +#else MALLOC( g.p.b, complex_double, l->inner_vector_size ); MALLOC( g.p.x, complex_double, l->inner_vector_size ); #endif +#endif #ifdef INIT_ONE_PREC } else { #endif @@ -246,19 +252,26 @@ void method_setup( vector_double *V, level_struct *l, struct Thread *threading ) } if ( g.method >=0 ) printf0("| restart length: %-3d |\n", g.restart ); - printf0("| m0: %+9.6lf |\n", creal(l->dirac_shift) ); - if(g.setup_m0!=l->dirac_shift) + printf0("| m0: %+9.6lf |\n", g.m0 ); + if(g.setup_m0!=g.m0) printf0("| setup m0: %+9.6lf |\n", g.setup_m0 ); printf0("| csw: %+9.6lf |\n", g.csw ); #ifdef HAVE_TM - printf0("| mu: %+9.6lf |\n", g.tm_mu); - if(g.setup_tm_mu!=g.tm_mu) - printf0("| setup mu: %+9.6lf |\n", g.setup_tm_mu ); - if(g.tm_mu_odd_shift!=0.) - printf0("| mu on odd sites: %+9.6lf |\n", g.tm_mu + g.tm_mu_odd_shift ); - if(g.tm_mu_even_shift!=0.) - printf0("| mu on even sites: %+9.6lf |\n", g.tm_mu + g.tm_mu_even_shift ); - + printf0("| mu: %+9.6lf |\n", g.mu); + if(g.setup_mu!=g.mu) + printf0("| setup mu: %+9.6lf |\n", g.setup_mu ); + if(g.mu_odd_shift!=0.) + printf0("| mu on odd sites: %+9.6lf |\n", g.mu + g.mu_odd_shift ); + if(g.mu_even_shift!=0.) + printf0("| mu on even sites: %+9.6lf |\n", g.mu + g.mu_even_shift ); +#endif +#ifdef HAVE_TM1p1 + if(g.epsbar) + printf0("| epsbar: %+9.6lf |\n", g.epsbar); + if(g.epsbar_ig5_odd_shift!=0.) + printf0("| ig5 epsbar odd sites: %+9.6lf |\n", g.epsbar_ig5_odd_shift ); + if(g.epsbar_ig5_even_shift!=0.) + printf0("| ig5 epsbar even sites: %+9.6lf |\n", g.epsbar_ig5_even_shift ); #endif if ( g.method > 0 ) { printf0("+----------------------------------------------------------+\n"); @@ -285,13 +298,20 @@ void method_setup( vector_double *V, level_struct *l, struct Thread *threading ) printf0("| tolerance: %-5.0le |\n", g.coarse_tol ); } #ifdef HAVE_TM - if( g.tm_mu!=0. && g.tm_mu_factor[i]!=1 ) - printf0("| mu: %+9.6lf |\n", g.tm_mu * g.tm_mu_factor[i]); - if( g.tm_mu_odd_shift!=0. && g.tm_mu_factor[i]!=1 ) - printf0("| mu on odd sites: %+9.6lf |\n", (g.tm_mu + g.tm_mu_odd_shift) * g.tm_mu_factor[i] ); - if( g.tm_mu_even_shift!=0. && g.tm_mu_factor[i]!=1 ) - printf0("| mu on even sites: %+9.6lf |\n", (g.tm_mu + g.tm_mu_even_shift) * g.tm_mu_factor[i] ); - + if( g.mu!=0. && g.mu_factor[i]!=1 ) + printf0("| mu: %+9.6lf |\n", g.mu * g.mu_factor[i] ); + if( g.mu_odd_shift!=0. && g.mu_factor[i]!=1 ) + printf0("| mu on odd sites: %+9.6lf |\n", (g.mu + g.mu_odd_shift) * g.mu_factor[i] ); + if( g.mu_even_shift!=0. && g.mu_factor[i]!=1 ) + printf0("| mu on even sites: %+9.6lf |\n", (g.mu + g.mu_even_shift) * g.mu_factor[i] ); +#endif +#ifdef HAVE_TM1p1 + if( g.epsbar!=0. && g.epsbar_factor[i]!=1 ) + printf0("| epsbar: %+9.6lf |\n", g.epsbar * g.epsbar_factor[i] ); + if(g.epsbar_ig5_odd_shift!=0. && g.epsbar_factor[i]!=1) + printf0("| ig5 epsbar on odd sites: %+9.6lf |\n", (g.epsbar + g.epsbar_ig5_odd_shift) * g.epsbar_factor[i] ); + if(g.epsbar_ig5_even_shift!=0. 
&& g.epsbar_factor[i]!=1) + printf0("| ig5 epsbar on even sites: %+9.6lf |\n", (g.epsbar + g.epsbar_ig5_even_shift) *g.epsbar_factor[i] ); #endif } } @@ -305,15 +325,17 @@ void method_setup( vector_double *V, level_struct *l, struct Thread *threading ) printf0("\n"); } #endif + g.in_setup = 0; END_LOCKED_MASTER(threading) + + START_LOCKED_MASTER(threading) + if ( l->depth==0 && g.method >=0 ) + prof_print( l ); + END_LOCKED_MASTER(threading) #ifdef DEBUG test_routine( l, threading ); #endif - START_LOCKED_MASTER(threading) - if ( l->depth==0 ) - prof_print( l ); - END_LOCKED_MASTER(threading) } @@ -341,9 +363,14 @@ void method_free( level_struct *l ) { #endif fgmres_MP_struct_free( &(g.p_MP) ); #if defined (INIT_ONE_PREC) && (defined (DEBUG) || defined (TEST_VECTOR_ANALYSIS)) +#ifdef HAVE_TM1p1 + FREE( g.p.b, complex_double, 2*l->inner_vector_size ); + FREE( g.p.x, complex_double, 2*l->inner_vector_size ); +#else FREE( g.p.b, complex_double, l->inner_vector_size ); FREE( g.p.x, complex_double, l->inner_vector_size ); #endif +#endif #ifdef INIT_ONE_PREC } else { #endif @@ -368,28 +395,28 @@ void method_update( int setup_iter, level_struct *l, struct Thread *threading ) if ( g.method > 0 && g.interpolation && g.num_levels > 1 && setup_iter > 0 ) { - double t0=0, t1=0, shift = creal(l->dirac_shift); + double t0=0, t1=0; START_LOCKED_MASTER(threading) g.in_setup = 1; - if ( l->depth==0 ) - prof_init( l ); + if ( l->depth==0 ) + prof_init( l ); END_LOCKED_MASTER(threading) - START_MASTER(threading) - t0 = MPI_Wtime(); - END_MASTER(threading) + MASTER(threading) + t0 = MPI_Wtime(); -#ifndef HAVE_TM - if ( g.setup_m0 != shift ) - optimized_shift_update( (complex_double)g.setup_m0, l, threading ); -#else - double tm_shift = g.tm_mu; - if ( g.setup_tm_mu != tm_shift || g.setup_m0 != shift ) { - g.tm_mu = g.setup_tm_mu; - optimized_shift_update( (complex_double)g.setup_m0, l, threading ); + if ( g.setup_m0 != g.m0 ) { + m0_update( (complex_double)g.setup_m0, l, threading ); +#ifdef HAVE_TM } + if ( g.setup_mu != g.mu ) { + tm_term_update( (complex_double)g.setup_mu, l, threading ); + finalize_operator_update( l, threading ); + } else if (g.setup_m0 != g.m0) { #endif + finalize_operator_update( l, threading ); + } if ( g.mixed_precision ) iterative_float_setup( setup_iter, l, threading ); @@ -397,34 +424,35 @@ void method_update( int setup_iter, level_struct *l, struct Thread *threading ) iterative_double_setup( setup_iter, l, threading ); -#ifndef HAVE_TM - if ( g.setup_m0 != shift ) - optimized_shift_update( (complex_double)shift, l, threading ); -#else - if ( g.setup_tm_mu != tm_shift || g.setup_m0 != shift ) { - g.tm_mu = tm_shift; - optimized_shift_update( (complex_double) shift, l, threading ); + if ( g.setup_m0 != g.m0 ) { + m0_update( (complex_double)g.m0, l, threading ); +#ifdef HAVE_TM } + if ( g.setup_mu != g.mu ) { + tm_term_update( (complex_double)g.mu, l, threading ); + finalize_operator_update( l, threading ); + } else if (g.setup_m0 != g.m0) { #endif + finalize_operator_update( l, threading ); + } - START_MASTER(threading) - t1 = MPI_Wtime(); - g.total_time = t1-t0; - printf0("\nperformed %d iterative setup steps\n", setup_iter ); - printf0("elapsed time: %lf seconds (%lf seconds on coarse grid)\n\n", t1-t0, g.coarse_time ); - END_MASTER(threading) + MASTER(threading) { + t1 = MPI_Wtime(); + g.total_time = t1-t0; + printf0("\nperformed %d iterative setup steps\n", setup_iter ); + printf0("elapsed time: %lf seconds (%lf seconds on coarse grid)\n\n", t1-t0, g.coarse_time ); + } -#ifdef 
DEBUG - test_routine( l, threading ); -#endif - START_LOCKED_MASTER(threading) g.in_setup = 0; if ( l->depth==0 ) prof_print( l ); END_LOCKED_MASTER(threading) - - + +#ifdef DEBUG + test_routine( l, threading ); +#endif + } } @@ -487,7 +515,10 @@ void method_finalize( level_struct *l ) { FREE( g.ncycle, int, ls ); FREE( g.relax_fac, double, ls ); #ifdef HAVE_TM - FREE( g.tm_mu_factor, double, ls ); + FREE( g.mu_factor, double, ls ); +#endif +#ifdef HAVE_TM1p1 + FREE( g.epsbar_factor, double, ls ); #endif FREE( g.block_iter, int, ls ); FREE( g.setup_iter, int, ls ); @@ -529,7 +560,7 @@ int read_parameter( void **save_at, char *search_pattern, char *read_format, int if ( read_from == NULL ) { if ( !set_default ) - error0("unable to find string \"%s\" --- fatal error\n", search_pattern); + error0("FILE NULL, unable to find string \"%s\" --- fatal error\n", search_pattern); else return match; } @@ -539,6 +570,7 @@ int read_parameter( void **save_at, char *search_pattern, char *read_format, int while ( !match && fgets( read_pattern, 100000, read_from ) ) { k = strlen( read_pattern ); + /* j = 0; for ( i=0; in) { + match = 1; + i = 0; + while ( ilevel = g.num_levels-1-l->depth; - l->post_smooth_iter = g.post_smooth_iter[l->depth]; - l->block_iter = g.block_iter[l->depth]; - l->setup_iter = g.setup_iter[l->depth]; - l->num_eig_vect = g.num_eig_vect[l->depth]; - - if ( l->level > 0 ) - parameter_update( l->next_level ); -} - void read_global_info( FILE *in ) { void *save_pt; @@ -706,26 +740,30 @@ void read_no_default_info( FILE *in, level_struct *l ) { read_parameter( &save_pt, "d0 block lattice:", "%d", 4, in, _NO_DEFAULT_SET ); // Wilson mass - save_pt = &(l->real_shift); l->real_shift = 0; + save_pt = &(g.m0); g.m0 = 0; read_parameter( &save_pt, "m0:", "%lf", 1, in, _DEFAULT_SET ); - if ( l->real_shift == 0 ) { + if ( g.m0 == 0 ) { double kappa=0; save_pt = &(kappa); read_parameter( &save_pt, "kappa:", "%lf", 1, in, _DEFAULT_SET ); ASSERT(kappa != 0); - l->real_shift = 1./(2.*kappa)-4.; //setting m0 from kappa + g.m0 = 1./(2.*kappa)-4.; //setting m0 from kappa } save_pt = &(g.csw); read_parameter( &save_pt, "csw:", "%lf", 1, in, _NO_DEFAULT_SET ); #ifdef HAVE_TM - save_pt = &(g.tm_mu);g.tm_mu=0; + save_pt = &(g.mu);g.mu=0; read_parameter( &save_pt, "mu:", "%lf", 1, in, _DEFAULT_SET ); - if ( g.tm_mu == 0 ) { + if ( g.mu == 0 ) { read_parameter( &save_pt, "2KappaMu:", "%lf", 1, in, _DEFAULT_SET ); - g.tm_mu = g.tm_mu*(4.+l->real_shift); + g.mu = g.mu*(4.+g.m0); } #endif +#ifdef HAVE_TM1p1 + save_pt = &(g.epsbar); g.epsbar = 0; + read_parameter( &save_pt, "epsbar:", "%lf", 1, in, _DEFAULT_SET ); +#endif } void set_global_info( struct init *params, level_struct *l ) { @@ -747,10 +785,10 @@ void set_global_info( struct init *params, level_struct *l ) { } // Operator - l->real_shift = 1./(2.*params->kappa)-4.; + g.m0 = 1./(2.*params->kappa)-4.; g.csw = params->csw; #ifdef HAVE_TM - g.tm_mu = params->mu; + g.mu = params->mu; #endif g.num_openmp_processes = params->number_openmp_threads; @@ -781,23 +819,23 @@ void read_geometry_data( FILE *in, int ls ) { int i, mu, nb, nls, nlls, flag; for ( i=0; i0) { // global lattice sprintf( inputstr, "d%d global lattice:", i ); save_pt = g.global_lattice[i]; if ( ! 
read_parameter( &save_pt, inputstr, "%d", 4, in, _DEFAULT_SET ) ) { - nls = 1; - for ( mu=0; mu<4; mu++ ) { - g.global_lattice[i][mu] = g.global_lattice[i-1][mu]/g.block_lattice[i-1][mu]; - nls *= g.global_lattice[i][mu]; - } - if ( g.odd_even && nls < 2 ) { - warning0("lattice dimensions not valid for a %d-level method, choosing a %d-level method\n", g.num_levels, i ); - g.num_levels = i; ls = i; - break; - } + nls = 1; + for ( mu=0; mu<4; mu++ ) { + g.global_lattice[i][mu] = g.global_lattice[i-1][mu]/g.block_lattice[i-1][mu]; + nls *= g.global_lattice[i][mu]; + } + if ( g.odd_even && nls < 2 ) { + warning0("lattice dimensions not valid for a %d-level method, choosing a %d-level method\n", g.num_levels, i ); + g.num_levels = i; ls = i; + break; + } } // local lattice @@ -805,84 +843,84 @@ void read_geometry_data( FILE *in, int ls ) { save_pt = g.local_lattice[i]; if ( ! read_parameter( &save_pt, inputstr, "%d", 4, in, _DEFAULT_SET ) ) { - nls = 1; - nlls = 1; - for ( mu=0; mu<4; mu++ ) { - g.local_lattice[i][mu] = g.local_lattice[i-1][mu]/g.block_lattice[i-1][mu]; - nlls *= g.local_lattice[i][mu]; - nls *= g.global_lattice[i][mu]; - } - if ( g.odd_even && nlls < 2 ) { - if ( nls/nlls > 1 ) { - mu = shortest_dir( g.local_lattice[i] ); - if ( g.global_lattice[i][mu] > g.local_lattice[i][mu] ) { - g.local_lattice[i][mu] *= lcm( g.local_lattice[i][mu], - g.global_lattice[i][mu]/g.local_lattice[i][mu] ); - } - } - } + nls = 1; + nlls = 1; + for ( mu=0; mu<4; mu++ ) { + g.local_lattice[i][mu] = g.local_lattice[i-1][mu]/g.block_lattice[i-1][mu]; + nlls *= g.local_lattice[i][mu]; + nls *= g.global_lattice[i][mu]; + } + if ( g.odd_even && nlls < 2 ) { + if ( nls/nlls > 1 ) { + mu = shortest_dir( g.local_lattice[i] ); + if ( g.global_lattice[i][mu] > g.local_lattice[i][mu] ) { + g.local_lattice[i][mu] *= lcm( g.local_lattice[i][mu], + g.global_lattice[i][mu]/g.local_lattice[i][mu] ); + } + } + } } // block lattice for ( mu=0; mu<4; mu++ ) - g.block_lattice[i][mu] = 1; + g.block_lattice[i][mu] = 1; if ( i g.local_lattice[i][mu] ) { - g.local_lattice[i][mu] *= lcm( g.local_lattice[i][mu], - g.global_lattice[i][mu]/g.local_lattice[i][mu] ); - } - } - - } + nb *= g.local_lattice[i][mu]/g.block_lattice[i][mu]; + + if ( g.local_lattice[i][mu] < g.block_lattice[i][mu] ) { + g.local_lattice[i][mu] *= g.block_lattice[i][mu]; + if ( ! 
DIVIDES( g.local_lattice[i][mu], g.global_lattice[i][mu] ) ) { + g.local_lattice[i][mu] /= g.block_lattice[i][mu]; + } + warning0("lattice dimensions not valid for a %d-level method, choosing a %d-level method\n", g.num_levels, i+1 ); + g.num_levels = i+1; ls=i+1; + g.block_lattice[i][mu] = 1; + flag = 0; + break; + } + } + + if ( flag == 1 && g.method == 2 && nb == 1 ) { + mu = shortest_dir( g.local_lattice[i] ); + if ( g.global_lattice[i][mu] > g.local_lattice[i][mu] ) { + g.local_lattice[i][mu] *= lcm( g.local_lattice[i][mu], + g.global_lattice[i][mu]/g.local_lattice[i][mu] ); + } + } + + } } } #ifdef DEBUG printf00("level: %d, gl: %3d %3d %3d %3d\n", i, g.global_lattice[i][0], - g.global_lattice[i][1],g.global_lattice[i][2],g.global_lattice[i][3] ); + g.global_lattice[i][1],g.global_lattice[i][2],g.global_lattice[i][3] ); printf00("level: %d, ll: %3d %3d %3d %3d\n", i, g.local_lattice[i][0], - g.local_lattice[i][1],g.local_lattice[i][2],g.local_lattice[i][3] ); - + g.local_lattice[i][1],g.local_lattice[i][2],g.local_lattice[i][3] ); + printf00("level: %d, bl: %3d %3d %3d %3d\n\n", i, g.block_lattice[i][0], - g.block_lattice[i][1],g.block_lattice[i][2],g.block_lattice[i][3] ); + g.block_lattice[i][1],g.block_lattice[i][2],g.block_lattice[i][3] ); #endif - - + + sprintf( inputstr, "d%d post smooth iter:", i ); save_pt = &(g.post_smooth_iter[i]); g.post_smooth_iter[i] = 4; read_parameter( &save_pt, inputstr, "%d", 1, in, _DEFAULT_SET ); @@ -910,7 +948,13 @@ void read_geometry_data( FILE *in, int ls ) { #ifdef HAVE_TM sprintf( inputstr, "d%d mu factor:", i ); - save_pt = &(g.tm_mu_factor[i]); g.tm_mu_factor[i] = 1; + save_pt = &(g.mu_factor[i]); g.mu_factor[i] = 1; + read_parameter( &save_pt, inputstr, "%lf", 1, in, _DEFAULT_SET ); +#endif + +#ifdef HAVE_TM1p1 + sprintf( inputstr, "d%d epsbar factor:", i ); + save_pt = &(g.epsbar_factor[i]); g.epsbar_factor[i] = 1; read_parameter( &save_pt, inputstr, "%lf", 1, in, _DEFAULT_SET ); #endif @@ -945,16 +989,23 @@ void read_solver_parameters( FILE *in, level_struct *l ) { save_pt = &(g.odd_even); g.odd_even = 1; read_parameter( &save_pt, "odd even preconditioning:", "%d", 1, in, _DEFAULT_SET ); - save_pt = &(g.setup_m0); g.setup_m0=l->real_shift; + save_pt = &(g.setup_m0); g.setup_m0 = g.m0; read_parameter( &save_pt, "setup m0:", "%lf", 1, in, _DEFAULT_SET ); #ifdef HAVE_TM - save_pt = &(g.tm_mu_odd_shift);g.tm_mu_odd_shift=0; + save_pt = &(g.mu_odd_shift); g.mu_odd_shift = 0; read_parameter( &save_pt, "mu odd shift:", "%lf", 1, in, _DEFAULT_SET ); - save_pt = &(g.tm_mu_even_shift);g.tm_mu_even_shift=0; + save_pt = &(g.mu_even_shift); g.mu_even_shift = 0; read_parameter( &save_pt, "mu even shift:", "%lf", 1, in, _DEFAULT_SET ); - save_pt = &(g.setup_tm_mu); g.setup_tm_mu=g.tm_mu; + save_pt = &(g.setup_mu); g.setup_mu = g.mu; read_parameter( &save_pt, "setup mu:", "%lf", 1, in, _DEFAULT_SET ); #endif + +#ifdef HAVE_TM1p1 + save_pt = &(g.epsbar_ig5_odd_shift);g.epsbar_ig5_odd_shift=0; + read_parameter( &save_pt, "epsbar odd shift:", "%lf", 1, in, _DEFAULT_SET ); + save_pt = &(g.epsbar_ig5_even_shift);g.epsbar_ig5_even_shift=0; + read_parameter( &save_pt, "epsbar even shift:", "%lf", 1, in, _DEFAULT_SET ); +#endif save_pt = &(g.method); g.method = 2; read_parameter( &save_pt, "method:", "%d", 1, in, _DEFAULT_SET ); @@ -1033,7 +1084,7 @@ void validate_parameters( int ls, level_struct *l ) { int i; int mu; - + #ifdef SSE if ( !g.odd_even ) warning0("The SSE implementation is based on the odd-even preconditioned code.\ @@ -1068,13 +1119,12 @@ void 
validate_parameters( int ls, level_struct *l ) { if ( g.block_lattice[i][mu] != g.global_lattice[i][mu]/g.global_lattice[i+1][mu] ) warning0("when using SSE, Schwarz block size and aggregate size have to match.\n"); ASSERT( g.block_lattice[i][mu] == g.global_lattice[i][mu]/g.global_lattice[i+1][mu] ); + // it works everywhere but we have some problem with the vector size. + // TODO: check all vectora allocated with size l->inner_vector_size + ASSERT( g.num_eig_vect[i] % SIMD_LENGTH_float == 0 ); #endif } - for ( i=0; i 0, g.kcycle_max_restart > 0 ) ); ASSERT( IMPLIES( g.kcycle && g.method > 0, 0 < g.kcycle_tol && g.kcycle_tol < 1 ) ); + + //LIST OF CASES WHICH SHOULD WORK, BUT DO NOT (TODO) + #ifdef SSE ASSERT( g.mixed_precision ); -// ASSERT( DIVIDES( 4, g.num_eig_vect[0] ) ); +#endif + + //TODO: Could work without, but you need to fix the setup phase. + for ( i=0; i2 + if ( g.num_levels>2 && g.interpolation ) + ASSERT( g.mixed_precision ); + +#ifdef HAVE_TM1p1 + //TODO: method = 6 not supported with HAVE_TM1p1. To fix all the g5D functions + ASSERT( g.method !=6 ); #endif } @@ -1130,7 +1195,10 @@ void allocate_for_global_struct_after_read_global_info( int ls ) { MALLOC( g.ncycle, int, ls ); MALLOC( g.relax_fac, double, ls ); #ifdef HAVE_TM - MALLOC( g.tm_mu_factor, double, ls ); + MALLOC( g.mu_factor, double, ls ); +#endif +#ifdef HAVE_TM1p1 + MALLOC( g.epsbar_factor, double, ls ); #endif MALLOC( g.block_iter, int, ls ); MALLOC( g.setup_iter, int, ls ); @@ -1157,7 +1225,8 @@ void set_level_and_global_structs_according_to_global_struct( level_struct *l ) l->block_iter = g.block_iter[0]; l->setup_iter = g.setup_iter[0]; l->num_eig_vect = g.num_eig_vect[0]; - + l->num_parent_eig_vect = 6; //for consistency sake + // compute some additional values l->num_lattice_site_var = 12; g.num_processes = 1; @@ -1170,16 +1239,7 @@ void set_level_and_global_structs_according_to_global_struct( level_struct *l ) g.num_processes *= l->global_splitting[mu]; } - l->dirac_shift = l->real_shift; -#ifdef HAVE_TM - l->tm_shift = g.tm_mu; - l->tm_even_shift = g.tm_mu_even_shift; - l->tm_odd_shift = g.tm_mu_odd_shift; -#endif - l->even_shift = l->dirac_shift; - l->odd_shift = l->dirac_shift; - g.solve_m0 = l->dirac_shift; - g.setup_m0 = l->dirac_shift; + g.setup_m0 = g.m0; } void lg_in( char *inputfile, level_struct *l ) { @@ -1214,6 +1274,29 @@ void lg_in( char *inputfile, level_struct *l ) { fclose(in); } +void parameter_update( level_struct *l ) { + + if(l->depth==0) { + int ls = MAX(g.num_levels,2); + set_level_and_global_structs_according_to_global_struct( l ); + validate_parameters( ls, l ); + } + + l->level = g.num_levels-1-l->depth; + l->post_smooth_iter = g.post_smooth_iter[l->depth]; + l->block_iter = g.block_iter[l->depth]; + l->setup_iter = g.setup_iter[l->depth]; + l->num_eig_vect = g.num_eig_vect[l->depth]; + if(l->depth>0) + l->num_parent_eig_vect = g.num_eig_vect[l->depth-1]; + else + l->num_parent_eig_vect = 6; + + if ( l->level > 0 && l->next_level != NULL ) + parameter_update( l->next_level ); +} + + void set_DDalphaAMG_parameters( struct init *params, level_struct *l ) { FILE *in=NULL; diff --git a/src/init_generic.c b/src/init_generic.c index d14ca5a..7f6b50b 100644 --- a/src/init_generic.c +++ b/src/init_generic.c @@ -98,12 +98,19 @@ double prof_PRECISION_print( level_struct *l ) { void fine_level_PRECISION_alloc( level_struct *l ) { int n = 8; - +#ifdef HAVE_TM1p1 + MALLOC( l->vbuf_PRECISION[0], complex_PRECISION, 2*n*l->vector_size ); + for ( int i=1; ivbuf_PRECISION[i] = 
l->vbuf_PRECISION[0] + 2*i*l->vector_size; + MALLOC( l->p_PRECISION.b, complex_PRECISION, 2*2*l->inner_vector_size ); + l->p_PRECISION.x = l->p_PRECISION.b + 2*l->inner_vector_size; +#else MALLOC( l->vbuf_PRECISION[0], complex_PRECISION, n*l->vector_size ); for ( int i=1; ivbuf_PRECISION[i] = l->vbuf_PRECISION[0] + i*l->vector_size; MALLOC( l->p_PRECISION.b, complex_PRECISION, 2*l->inner_vector_size ); l->p_PRECISION.x = l->p_PRECISION.b + l->inner_vector_size; +#endif } @@ -111,11 +118,19 @@ void fine_level_PRECISION_free( level_struct *l ) { int n = 8; +#ifdef HAVE_TM1p1 + FREE( l->vbuf_PRECISION[0], complex_PRECISION, 2*n*l->vector_size ); + for ( int i=1; ivbuf_PRECISION[i] = NULL; + FREE( l->p_PRECISION.b, complex_PRECISION, 2*2*l->inner_vector_size ); + l->p_PRECISION.x = NULL; +#else FREE( l->vbuf_PRECISION[0], complex_PRECISION, n*l->vector_size ); for ( int i=1; ivbuf_PRECISION[i] = NULL; FREE( l->p_PRECISION.b, complex_PRECISION, 2*l->inner_vector_size ); l->p_PRECISION.x = NULL; +#endif } @@ -143,18 +158,28 @@ void next_level_PRECISION_setup( level_struct *l ) { g.method==6?g5D_apply_coarse_operator_PRECISION:apply_coarse_operator_PRECISION, &(l->next_level->p_PRECISION), l->next_level ); } else { +#ifdef HAVE_TM1p1 + MALLOC( l->next_level->p_PRECISION.b, complex_PRECISION, 2*2*l->next_level->vector_size ); + l->next_level->p_PRECISION.x = l->next_level->p_PRECISION.b + 2*l->next_level->vector_size; +#else MALLOC( l->next_level->p_PRECISION.b, complex_PRECISION, 2*l->next_level->vector_size ); l->next_level->p_PRECISION.x = l->next_level->p_PRECISION.b + l->next_level->vector_size; - l->next_level->p_PRECISION.shift = 0; +#endif l->next_level->p_PRECISION.v_start = 0; - l->next_level->p_PRECISION.v_end = l->inner_vector_size; + l->next_level->p_PRECISION.v_end = l->next_level->inner_vector_size; } } int i, n = (l->next_level->level>0)?6:4; +#ifdef HAVE_TM1p1 + MALLOC( l->next_level->vbuf_PRECISION[0], complex_PRECISION, 2*n*l->next_level->vector_size ); + for ( i=1; inext_level->vbuf_PRECISION[i] = l->next_level->vbuf_PRECISION[0] + 2*i*l->next_level->vector_size; +#else MALLOC( l->next_level->vbuf_PRECISION[0], complex_PRECISION, n*l->next_level->vector_size ); for ( i=1; inext_level->vbuf_PRECISION[i] = l->next_level->vbuf_PRECISION[0] + i*l->next_level->vector_size; +#endif } } @@ -167,13 +192,21 @@ void next_level_PRECISION_free( level_struct *l ) { if ( ( l->level == 1 && !l->next_level->idle ) || g.kcycle ) { fgmres_PRECISION_struct_free( &(l->next_level->p_PRECISION), l->next_level ); } else { +#ifdef HAVE_TM1p1 + FREE( l->next_level->p_PRECISION.b, complex_PRECISION, 2*2*l->next_level->vector_size ); +#else FREE( l->next_level->p_PRECISION.b, complex_PRECISION, 2*l->next_level->vector_size ); +#endif } int i, n = (l->next_level->level>0)?6:4; for ( i=1; inext_level->vbuf_PRECISION[i] = NULL; +#ifdef HAVE_TM1p1 + FREE( l->next_level->vbuf_PRECISION[0], complex_PRECISION, 2*n*l->next_level->vector_size ); +#else FREE( l->next_level->vbuf_PRECISION[0], complex_PRECISION, n*l->next_level->vector_size ); +#endif coarsening_index_table_PRECISION_free( &(l->is_PRECISION), l ); } @@ -203,9 +236,7 @@ void vcycle_timing_PRECISION( int n, level_struct *l, struct Thread *threading ) PUBLIC_MALLOC( v1, complex_PRECISION, l->inner_vector_size ); PUBLIC_MALLOC( v2, complex_PRECISION, l->inner_vector_size ); - START_LOCKED_MASTER(threading) - vector_PRECISION_define_random( v2, 0, l->inner_vector_size, l ); - END_LOCKED_MASTER(threading) + vector_PRECISION_define_random( v2, 0, 
l->inner_vector_size, l, threading ); START_MASTER(threading) t0 = MPI_Wtime(); diff --git a/src/interpolation_generic.c b/src/interpolation_generic.c index 5c78c9f..b6a4436 100644 --- a/src/interpolation_generic.c +++ b/src/interpolation_generic.c @@ -21,22 +21,26 @@ #include "main.h" -#if ( !defined( SSE ) || !defined( INTERPOLATION_OPERATOR_LAYOUT_OPTIMIZED_PRECISION ) ) - void interpolation_PRECISION_alloc( level_struct *l ) { int k, n = l->num_eig_vect; MALLOC( l->is_PRECISION.eigenvalues, complex_PRECISION, n ); MALLOC( l->is_PRECISION.test_vector, complex_PRECISION*, n ); +#ifndef OPTIMIZED_INTERPOLATION_SETUP_PRECISION MALLOC( l->is_PRECISION.interpolation, complex_PRECISION*, n ); l->is_PRECISION.interpolation[0] = NULL; MALLOC_HUGEPAGES( l->is_PRECISION.interpolation[0], complex_PRECISION, n*l->vector_size, 64 ); for ( k=1; kis_PRECISION.interpolation[k] = l->is_PRECISION.interpolation[0] + k*l->vector_size; MALLOC( l->is_PRECISION.operator, complex_PRECISION, n*l->inner_vector_size ); +#else + // ghost shell is communicated in coarse_operator_setup, so we need size=vector_size, not inner_vector_size + MALLOC_HUGEPAGES( l->is_PRECISION.operator, complex_PRECISION, + ((size_t)OPERATOR_COMPONENT_OFFSET_PRECISION)*((size_t)l->vector_size), 128 ); +#endif l->is_PRECISION.test_vector[0] = NULL; - MALLOC_HUGEPAGES( l->is_PRECISION.test_vector[0], complex_PRECISION, n*l->inner_vector_size, 64 ); + MALLOC_HUGEPAGES( l->is_PRECISION.test_vector[0], complex_PRECISION, n*l->inner_vector_size, 128 ); for ( k=1; kis_PRECISION.test_vector[k] = l->is_PRECISION.test_vector[0] + k*l->inner_vector_size; } @@ -64,9 +68,13 @@ void interpolation_PRECISION_free( level_struct *l ) { FREE_HUGEPAGES( l->is_PRECISION.test_vector[0], complex_PRECISION, n*l->inner_vector_size ); FREE( l->is_PRECISION.eigenvalues, complex_PRECISION, n ); FREE( l->is_PRECISION.test_vector, complex_PRECISION*, n ); +#ifndef OPTIMIZED_INTERPOLATION_SETUP_PRECISION FREE_HUGEPAGES( l->is_PRECISION.interpolation[0], complex_PRECISION, n*l->vector_size ); FREE( l->is_PRECISION.interpolation, complex_PRECISION*, n ); FREE( l->is_PRECISION.operator, complex_PRECISION, n*l->inner_vector_size ); +#else + FREE_HUGEPAGES( l->is_PRECISION.operator, complex_PRECISION, OPERATOR_COMPONENT_OFFSET_PRECISION*l->vector_size ); +#endif } @@ -80,12 +88,39 @@ void define_interpolation_PRECISION_operator( complex_PRECISION **interpolation, int end = threading->end_index[l->depth]; SYNC_CORES(threading) +#ifndef OPTIMIZED_INTERPOLATION_OPERATOR_PRECISION operator += start*num_eig_vect; for ( int i=start; i num_eig_vect) + j_end = num_eig_vect; + + operator = l->is_PRECISION.operator + j*l->vector_size + start*offset; + + for ( int i=start; iis_PRECISION.num_agg, num_eig_vect = l->num_eig_vect, sign = 1, - num_parent_eig_vect = l->num_lattice_site_var/2, aggregate_sites = l->num_inner_lattice_sites / num_aggregates; + int i, j, k, k1, k2, num_aggregates = l->is_PRECISION.num_agg, num_eig_vect = l->num_eig_vect, + num_parent_eig_vect = l->num_parent_eig_vect, aggregate_sites = l->num_inner_lattice_sites / num_aggregates; complex_PRECISION *operator = l->is_PRECISION.operator, *phi_pt = phi, *phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer; @@ -102,64 +137,184 @@ void interpolate_PRECISION( vector_PRECISION phi, vector_PRECISION phi_c, level_ vector_PRECISION_distribute( phi_c_pt, phi_c, l->next_level ); END_LOCKED_MASTER(threading) SYNC_HYPERTHREADS(threading) - - for ( i=threading->n_thread*threading->core + threading->thread; 
in_core*threading->n_thread ) { - phi_pt = phi + i*2*num_parent_eig_vect*aggregate_sites; - phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer + i*2*num_eig_vect; - operator = l->is_PRECISION.operator + i*2*num_eig_vect*num_parent_eig_vect*aggregate_sites; - for ( k=0; kn_thread*threading->core + threading->thread; in_core*threading->n_thread ) { + phi_pt = phi + i*2*2*num_parent_eig_vect*aggregate_sites; + phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer + i*2*2*num_eig_vect; + operator = l->is_PRECISION.operator + i*2*num_eig_vect*num_parent_eig_vect*aggregate_sites; + for ( k=0; kn_thread*threading->core + threading->thread; in_core*threading->n_thread ) { + phi_pt = phi + i*2*num_parent_eig_vect*aggregate_sites; + phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer + i*2*num_eig_vect; + operator = l->is_PRECISION.operator + i*2*num_eig_vect*num_parent_eig_vect*aggregate_sites; + for ( k=0; kn_thread*threading->core + threading->thread; in_core*threading->n_thread ) { + phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer + i*2*2*num_eig_vect; + PRECISION tmp_phi1_c_re[2*OPERATOR_COMPONENT_OFFSET_PRECISION]; + PRECISION tmp_phi1_c_im[2*OPERATOR_COMPONENT_OFFSET_PRECISION]; + PRECISION tmp_phi2_c_re[2*OPERATOR_COMPONENT_OFFSET_PRECISION]; + PRECISION tmp_phi2_c_im[2*OPERATOR_COMPONENT_OFFSET_PRECISION]; + mm_PRECISION zero = mm_setzero_PRECISION(); + for ( j=0; j<2*OPERATOR_COMPONENT_OFFSET_PRECISION; j+=SIMD_LENGTH_PRECISION ) { + mm_store_PRECISION(tmp_phi1_c_re+j, zero); + mm_store_PRECISION(tmp_phi1_c_im+j, zero); + mm_store_PRECISION(tmp_phi2_c_re+j, zero); + mm_store_PRECISION(tmp_phi2_c_im+j, zero); + } + // copy phi_c into temporary + for ( j=0; jis_PRECISION.operator + j*l->vector_size/2 + i*2*offset*num_parent_eig_vect*aggregate_sites; + + for ( k=0; kn_thread*threading->core + threading->thread; in_core*threading->n_thread ) { + phi_pt = phi + i*2*num_parent_eig_vect*aggregate_sites; + phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer + i*2*num_eig_vect; + PRECISION tmp_phi_c_re[2*OPERATOR_COMPONENT_OFFSET_PRECISION]; + PRECISION tmp_phi_c_im[2*OPERATOR_COMPONENT_OFFSET_PRECISION]; + mm_PRECISION zero = mm_setzero_PRECISION(); + for ( j=0; j<2*OPERATOR_COMPONENT_OFFSET_PRECISION; j+=SIMD_LENGTH_PRECISION ) { + mm_store_PRECISION(tmp_phi_c_re+j, zero); + mm_store_PRECISION(tmp_phi_c_im+j, zero); + } + // copy phi_c into temporary + for ( j=0; jis_PRECISION.operator + j*l->vector_size + i*2*offset*num_parent_eig_vect*aggregate_sites; + + for ( k=0; kis_PRECISION.num_agg, num_eig_vect = l->num_eig_vect, - num_parent_eig_vect = l->num_lattice_site_var/2, aggregate_sites = l->num_inner_lattice_sites / num_aggregates; - complex_PRECISION *operator = l->is_PRECISION.operator, *phi_pt = phi, - *phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer; - - START_LOCKED_MASTER(threading) - vector_PRECISION_distribute( phi_c_pt, phi_c, l->next_level ); - END_LOCKED_MASTER(threading) - SYNC_HYPERTHREADS(threading) - - for ( i=threading->n_thread*threading->core + threading->thread; in_core*threading->n_thread ) { - phi_pt = phi + i*2*num_parent_eig_vect*aggregate_sites; - phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer + i*2*num_eig_vect; - int sign = 1; - operator = l->is_PRECISION.operator + i*2*num_eig_vect*num_parent_eig_vect*aggregate_sites; - for ( k=0; kis_PRECISION.num_agg, num_eig_vect = l->num_eig_vect, sign = 1, - num_parent_eig_vect = l->num_lattice_site_var/2, aggregate_sites = l->num_inner_lattice_sites / num_aggregates; + int i, j, k, k1, k2, 
num_aggregates = l->is_PRECISION.num_agg, num_eig_vect = l->num_eig_vect, + num_parent_eig_vect = l->num_parent_eig_vect, aggregate_sites = l->num_inner_lattice_sites / num_aggregates; complex_PRECISION *operator = l->is_PRECISION.operator, *phi_pt = phi, *phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer; - for ( i=threading->n_thread*threading->core + threading->thread; in_core*threading->n_thread ) { - phi_pt = phi + i*2*num_parent_eig_vect*aggregate_sites; - phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer + i*2*num_eig_vect; - operator = l->is_PRECISION.operator + i*2*num_eig_vect*num_parent_eig_vect*aggregate_sites; +#ifndef OPTIMIZED_INTERPOLATION_OPERATOR_PRECISION + int sign = 1; +#ifdef HAVE_TM1p1 + if( g.n_flavours==2 ) + for ( i=threading->n_thread*threading->core + threading->thread; in_core*threading->n_thread ) { + phi_pt = phi + i*2*2*num_parent_eig_vect*aggregate_sites; + phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer + i*2*2*num_eig_vect; + operator = l->is_PRECISION.operator + i*2*num_eig_vect*num_parent_eig_vect*aggregate_sites; + + for ( j=0; j<2*2*num_eig_vect; j++ ) + phi_c_pt[j] = 0; + + for ( k=0; kn_thread*threading->core + threading->thread; in_core*threading->n_thread ) { + phi_pt = phi + i*2*num_parent_eig_vect*aggregate_sites; + phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer + i*2*num_eig_vect; + operator = l->is_PRECISION.operator + i*2*num_eig_vect*num_parent_eig_vect*aggregate_sites; + + for ( j=0; j<2*num_eig_vect; j++ ) + phi_c_pt[j] = 0; + + for ( k=0; kn_thread*threading->core + threading->thread; in_core*threading->n_thread ) { + + int offset = SIMD_LENGTH_PRECISION; + phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer + i*2*2*num_eig_vect; + + // loop over blocks of SIMD_LENGTH_PRECISION vectors + for ( j=0; jis_PRECISION.operator + j*l->vector_size/2 + i*2*offset*num_parent_eig_vect*aggregate_sites; + + // temporary, so we can used aligned load/store, and don't have to mess around with deinterleaving + // complex components and masking + // factor 2 is for low/high (refers to spin components being split up for preserving gamma5-symmetry of coarse operator) + PRECISION tmp_phi1_c_re[2*offset]; + PRECISION tmp_phi1_c_im[2*offset]; + PRECISION tmp_phi2_c_re[2*offset]; + PRECISION tmp_phi2_c_im[2*offset]; + mm_PRECISION zero = mm_setzero_PRECISION(); + for ( k1=0; k1<2*offset; k1+=offset ) { + mm_store_PRECISION(tmp_phi1_c_re+k1, zero); + mm_store_PRECISION(tmp_phi1_c_im+k1, zero); + mm_store_PRECISION(tmp_phi2_c_re+k1, zero); + mm_store_PRECISION(tmp_phi2_c_im+k1, zero); + } + + for ( k=0; k broadcast + mm_PRECISION phi1_re = mm_set1_PRECISION(((PRECISION *)phi_pt)[0]); + mm_PRECISION phi1_im = mm_set1_PRECISION(((PRECISION *)phi_pt)[1]); + mm_PRECISION phi2_re = mm_set1_PRECISION(((PRECISION *)phi_pt)[0+2*num_parent_eig_vect]); + mm_PRECISION phi2_im = mm_set1_PRECISION(((PRECISION *)phi_pt)[1+2*num_parent_eig_vect]); - for ( j=0; j<2*num_eig_vect; j++ ) - phi_c_pt[j] = 0; - - for ( k=0; k= num_eig_vect ) break; + ((PRECISION*)(phi_c_pt+j+m))[0] = tmp_phi1_c_re[m]; + ((PRECISION*)(phi_c_pt+j+m))[1] = tmp_phi1_c_im[m]; + } + for ( int m=0; m= num_eig_vect ) break; + ((PRECISION*)(phi_c_pt+num_eig_vect+j+m))[0] = tmp_phi2_c_re[m]; + ((PRECISION*)(phi_c_pt+num_eig_vect+j+m))[1] = tmp_phi2_c_im[m]; + } + for ( int m=0; m= num_eig_vect ) break; + ((PRECISION*)(phi_c_pt+2*num_eig_vect+j+m))[0] = tmp_phi1_c_re[m+offset]; + ((PRECISION*)(phi_c_pt+2*num_eig_vect+j+m))[1] = tmp_phi1_c_im[m+offset]; + } + for ( int m=0; m= 
num_eig_vect ) break; + ((PRECISION*)(phi_c_pt+3*num_eig_vect+j+m))[0] = tmp_phi2_c_re[m+offset]; + ((PRECISION*)(phi_c_pt+3*num_eig_vect+j+m))[1] = tmp_phi2_c_im[m+offset]; + } } } - } + else +#endif + for ( i=threading->n_thread*threading->core + threading->thread; in_core*threading->n_thread ) { + + int offset = SIMD_LENGTH_PRECISION; + // loop over blocks of SIMD_LENGTH_PRECISION vectors + for ( j=0; jnext_level->gs_PRECISION.transfer_buffer + i*2*num_eig_vect; + operator = l->is_PRECISION.operator + j*l->vector_size + i*2*offset*num_parent_eig_vect*aggregate_sites; + + // temporary, so we can used aligned load/store, and don't have to mess around with deinterleaving + // complex components and masking + // factor 2 is for low/high (refers to spin components being split up for preserving gamma5-symmetry of coarse operator) + PRECISION tmp_phi_c_re[2*offset]; + PRECISION tmp_phi_c_im[2*offset]; + mm_PRECISION zero = mm_setzero_PRECISION(); + for ( k1=0; k1<2*offset; k1+=offset ) { + mm_store_PRECISION(tmp_phi_c_re+k1, zero); + mm_store_PRECISION(tmp_phi_c_im+k1, zero); + } + + for ( k=0; k broadcast + mm_PRECISION phi_re = mm_set1_PRECISION(((PRECISION *)phi_pt)[0]); + mm_PRECISION phi_im = mm_set1_PRECISION(((PRECISION *)phi_pt)[1]); + + mm_PRECISION operator_re = mm_load_PRECISION((PRECISION *)operator); + mm_PRECISION operator_im = mm_load_PRECISION((PRECISION *)operator+offset); + mm_PRECISION phi_c_re = mm_load_PRECISION(tmp_phi_c_re+low_high_offset); + mm_PRECISION phi_c_im = mm_load_PRECISION(tmp_phi_c_im+low_high_offset); + + cfmadd_conj_PRECISION(operator_re, operator_im, phi_re, phi_im, &phi_c_re, &phi_c_im); + + mm_store_PRECISION(tmp_phi_c_re+low_high_offset, phi_c_re); + mm_store_PRECISION(tmp_phi_c_im+low_high_offset, phi_c_im); + // skip to next real line of matrix + operator += offset; + phi_pt++; + } + low_high_offset = offset; + } + } + + for ( int m=0; m= num_eig_vect ) break; + ((PRECISION*)(phi_c_pt+j+m))[0] = tmp_phi_c_re[m]; + ((PRECISION*)(phi_c_pt+j+m))[1] = tmp_phi_c_im[m]; + } + + for ( int m=0; m= num_eig_vect ) break; + ((PRECISION*)(phi_c_pt+num_eig_vect+j+m))[0] = tmp_phi_c_re[m+offset]; + ((PRECISION*)(phi_c_pt+num_eig_vect+j+m))[1] = tmp_phi_c_im[m+offset]; + } + } + } +#endif SYNC_HYPERTHREADS(threading) START_LOCKED_MASTER(threading) @@ -205,5 +543,3 @@ void restrict_PRECISION( vector_PRECISION phi_c, vector_PRECISION phi, level_str END_LOCKED_MASTER(threading) PROF_PRECISION_STOP( _PR, 1, threading ); } - -#endif diff --git a/src/interpolation_generic.h b/src/interpolation_generic.h index 97be6ec..90e9051 100644 --- a/src/interpolation_generic.h +++ b/src/interpolation_generic.h @@ -22,17 +22,16 @@ #ifndef INTERPOLATION_PRECISION_HEADER #define INTERPOLATION_PRECISION_HEADER - struct Thread; void interpolation_PRECISION_alloc( level_struct *l ); void interpolation_PRECISION_free( level_struct *l ); void interpolation_PRECISION_dummy_alloc( level_struct *l ); void interpolation_PRECISION_dummy_free( level_struct *l ); - void interpolate_PRECISION( vector_PRECISION phi, vector_PRECISION phi_c, level_struct *l, struct Thread *threading ); - void interpolate3_PRECISION( vector_PRECISION phi, vector_PRECISION phi_c, level_struct *l, struct Thread *threading ); - void restrict_PRECISION( vector_PRECISION phi_c, vector_PRECISION phi, level_struct *l, struct Thread *threading ); + void interpolate_PRECISION( vector_PRECISION phi, vector_PRECISION phi_c, level_struct *l, Thread *threading ); + void interpolate3_PRECISION( vector_PRECISION phi, vector_PRECISION 
phi_c, level_struct *l, Thread *threading );
+   void restrict_PRECISION( vector_PRECISION phi_c, vector_PRECISION phi, level_struct *l, Thread *threading );
-   void define_interpolation_PRECISION_operator( complex_PRECISION **interpolation, level_struct *l, struct Thread *threading );
+   void define_interpolation_PRECISION_operator( complex_PRECISION **interpolation, level_struct *l, Thread *threading );
 #endif
diff --git a/src/io.c b/src/io.c
index 354d545..deb104c 100644
--- a/src/io.c
+++ b/src/io.c
@@ -373,7 +373,7 @@ unsigned int initFile( char *filename, const int mode, level_struct *l ) {
   lls[1] = l->local_lattice[1];
   lls[2] = l->local_lattice[2];
   lls[3] = l->local_lattice[0];
-  writeSmallDataset_double(configgroup_id, "m0", l->real_shift);
+  writeSmallDataset_double(configgroup_id, "m0", g.m0);
   writeSmallDataset_double(configgroup_id, "csw", g.csw);
   writeSmallDataset_double(configgroup_id, "plaquette_clov", g.plaq_clov);
   writeSmallDataset_double(configgroup_id, "plaquette_hopp", g.plaq_hopp);
@@ -548,7 +548,7 @@ void write_header_mg( FILE **file, double *lambda, char* vector_type, int n, lev
   fprintf( *file, "
\n" ); fprintf( *file, "%s\n", vector_type ); fprintf( *file, "clifford basis: %s\n", CLIFFORD_BASIS ); - fprintf( *file, "m0: %.14lf\n", l->real_shift ); + fprintf( *file, "m0: %.14lf\n", g.m0 ); fprintf( *file, "csw: %.14lf\n", g.csw ); fprintf( *file, "clov plaq: %.14lf\n", g.plaq_clov ); fprintf( *file, "hopp plaq: %.14lf\n", g.plaq_hopp ); diff --git a/src/lime_io.c b/src/lime_io.c index 9dfa5e3..a082271 100644 --- a/src/lime_io.c +++ b/src/lime_io.c @@ -33,7 +33,7 @@ typedef struct lime_fileinfo { * LIME functions * * In DDalphaAMG format: - ** t slowest running index + ** t slowest running index ** x fastest running index ** all positive directions ** ordering: +T,+Z,+Y,+X @@ -502,7 +502,7 @@ void lime_write_vector( double *phi, char *filename ) { } } - if ( g.my_rank == 0 ) { + if ( g.my_rank == 0 ) { for ( i=0; idata + i ) ); } diff --git a/src/linalg.c b/src/linalg.c index 3487404..cdc1171 100644 --- a/src/linalg.c +++ b/src/linalg.c @@ -21,7 +21,6 @@ #include "main.h" -#ifndef OPTIMIZED_LINALG_float void process_multi_inner_product_MP( int count, complex_double *results, vector_float *phi, vector_float psi, int start, int end, level_struct *l, struct Thread *threading ) { @@ -36,6 +35,8 @@ void process_multi_inner_product_MP( int count, complex_double *results, vector_ SYNC_CORES(threading) +#ifndef OPTIMIZED_LINALG_float + compute_core_start_end_custom(start, end, &thread_start, &thread_end, l, threading, 12); for(int c=0; cworkspace)[threading->core] = results; END_NO_HYPERTHREADS(threading) @@ -60,7 +86,7 @@ void process_multi_inner_product_MP( int count, complex_double *results, vector_ PROF_float_STOP( _PIP, (double)(end-start)/(double)l->inner_vector_size, threading ); } -#endif + double global_norm_MP( vector_float x, int start, int end, level_struct *l, struct Thread *threading ) { diff --git a/src/linalg.h b/src/linalg.h index 4182def..62e95b5 100644 --- a/src/linalg.h +++ b/src/linalg.h @@ -24,16 +24,8 @@ struct Thread; - void vector_double_multi_saxpy( vector_double z, vector_double *V, complex_double *alpha, - int sign, int count, int start, int end, level_struct *l ); - - void vector_float_multi_saxpy( vector_float z, vector_float *V, complex_float *alpha, - int sign, int count, int start, int end, level_struct *l ); - - void process_multi_inner_product_MP( int count, complex_double *results, vector_float *phi, - vector_float psi, int start, int end, level_struct *l, - struct Thread *threading ); - + void process_multi_inner_product_MP( int count, complex_double *results, vector_float *phi, vector_float psi, + int start, int end, level_struct *l, struct Thread *threading ); double global_norm_MP( vector_float x, int start, int end, level_struct *l, struct Thread *threading ); #endif diff --git a/src/linalg_generic.c b/src/linalg_generic.c index 58153e1..635426c 100644 --- a/src/linalg_generic.c +++ b/src/linalg_generic.c @@ -21,57 +21,69 @@ #include "main.h" -#include "sse_float_intrinsic.h" -#include "sse_linalg.h" -#include "sse_linalg_PRECISION.h" - -#ifndef OPTIMIZED_LINALG_PRECISION -complex_PRECISION global_inner_product_PRECISION( vector_PRECISION phi, vector_PRECISION psi, int start, int end, level_struct *l, struct Thread *threading ) { +complex_PRECISION global_inner_product_PRECISION( vector_PRECISION phi, vector_PRECISION psi, int start, int end, + level_struct *l, struct Thread *threading ) { PROF_PRECISION_START( _GIP, threading ); complex_PRECISION local_alpha = 0, global_alpha = 0; int thread_start; int thread_end; - compute_core_start_end(start, 
end, &thread_start, &thread_end, l, threading); - SYNC_CORES(threading) + SYNC_CORES(threading); +#ifndef OPTIMIZED_LINALG_PRECISION + + compute_core_start_end(start, end, &thread_start, &thread_end, l, threading); VECTOR_FOR( int i=thread_start, iworkspace)[threading->core] = local_alpha; - END_NO_HYPERTHREADS(threading) + ((complex_PRECISION *)threading->workspace)[threading->core] = local_alpha; + END_NO_HYPERTHREADS(threading); // master sums up all results - SYNC_CORES(threading) - START_MASTER(threading) - for(int i=1; in_core; i++) - ((complex_PRECISION *)threading->workspace)[0] += ((complex_PRECISION *)threading->workspace)[i]; - local_alpha = ((complex_PRECISION *)threading->workspace)[0]; - END_MASTER(threading) + SYNC_CORES(threading); + MASTER(threading) { + for(int i=1; in_core; i++) + ((complex_PRECISION *)threading->workspace)[0] += ((complex_PRECISION *)threading->workspace)[i]; + local_alpha = ((complex_PRECISION *)threading->workspace)[0]; + } if ( g.num_processes > 1 ) { - START_MASTER(threading) - PROF_PRECISION_START( _ALLR ); - MPI_Allreduce( &local_alpha, &global_alpha, 1, MPI_COMPLEX_PRECISION, MPI_SUM, (l->depth==0)?g.comm_cart:l->gs_PRECISION.level_comm ); - PROF_PRECISION_STOP( _ALLR, 1 ); - ((complex_PRECISION *)threading->workspace)[0] = global_alpha; - END_MASTER(threading) + MASTER(threading) { + PROF_PRECISION_START( _ALLR ); + MPI_Allreduce( &local_alpha, &global_alpha, 1, MPI_COMPLEX_PRECISION, MPI_SUM, (l->depth==0)?g.comm_cart:l->gs_PRECISION.level_comm ); + PROF_PRECISION_STOP( _ALLR, 1 ); + ((complex_PRECISION *)threading->workspace)[0] = global_alpha; + } // all threads need the result of the norm - SYNC_MASTER_TO_ALL(threading) + SYNC_MASTER_TO_ALL(threading); global_alpha = ((complex_PRECISION *)threading->workspace)[0]; PROF_PRECISION_STOP( _GIP, (double)(end-start)/(double)l->inner_vector_size, threading ); return global_alpha; } else { // all threads need the result of the norm - SYNC_MASTER_TO_ALL(threading) + SYNC_MASTER_TO_ALL(threading); local_alpha = ((complex_PRECISION *)threading->workspace)[0]; PROF_PRECISION_STOP( _GIP, (double)(end-start)/(double)l->inner_vector_size, threading ); return local_alpha; } } -#endif complex_PRECISION process_inner_product_PRECISION( vector_PRECISION phi, vector_PRECISION psi, int start, int end, level_struct *l, struct Thread *threading ) { @@ -103,7 +115,6 @@ complex_PRECISION process_inner_product_PRECISION( vector_PRECISION phi, vector_ } -#if !defined( OPTIMIZED_LINALG_PRECISION ) void process_multi_inner_product_PRECISION( int count, complex_PRECISION *results, vector_PRECISION *phi, vector_PRECISION psi, int start, int end, level_struct *l, struct Thread *threading ) { @@ -116,25 +127,40 @@ void process_multi_inner_product_PRECISION( int count, complex_PRECISION *result int thread_end; SYNC_CORES(threading) + +#ifndef OPTIMIZED_LINALG_PRECISION + if ( l->depth == 0 ) { compute_core_start_end_custom(start, end, &thread_start, &thread_end, l, threading, 12); for(int c=0; cworkspace)[threading->core] = results; END_NO_HYPERTHREADS(threading) @@ -152,7 +178,6 @@ void process_multi_inner_product_PRECISION( int count, complex_PRECISION *result PROF_PRECISION_STOP( _PIP, (double)(end-start)/(double)l->inner_vector_size, threading ); } -#endif complex_PRECISION local_xy_over_xx_PRECISION( vector_PRECISION phi, vector_PRECISION psi, int start, int end, level_struct *l ) { @@ -168,7 +193,7 @@ complex_PRECISION local_xy_over_xx_PRECISION( vector_PRECISION phi, vector_PRECI return numerator/denominator; } 
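/*
 * Illustrative sketch (not part of the patch): the reduction pattern that
 * global_inner_product_PRECISION and global_norm_PRECISION follow after this
 * restructuring -- every core accumulates a partial result over its slice of
 * the vector, the partials are combined on the master, and one MPI_Allreduce
 * makes the global value known to all processes.  The library's Thread
 * struct, workspace buffer and SYNC_* macros are replaced here by a plain
 * OpenMP reduction, so every name and signature below is an assumption made
 * for illustration only.
 */
#include <math.h>
#include <mpi.h>

static double global_norm_sketch( const double *x, int n, MPI_Comm comm ) {
  double partial = 0.0;

  /* per-core partial sums over disjoint index ranges (the library does this
     manually via compute_core_start_end and threading->workspace) */
  #pragma omp parallel for reduction(+:partial)
  for ( int i = 0; i < n; i++ )
    partial += x[i]*x[i];

  /* one collective over all MPI ranks; the result is then available on every
     process, mirroring SYNC_MASTER_TO_ALL after the library's MPI_Allreduce */
  double global = 0.0;
  MPI_Allreduce( &partial, &global, 1, MPI_DOUBLE, MPI_SUM, comm );

  return sqrt( global );
}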
-#ifndef OPTIMIZED_LINALG_PRECISION + PRECISION global_norm_PRECISION( vector_PRECISION x, int start, int end, level_struct *l, struct Thread *threading ) { PROF_PRECISION_START( _GIP, threading ); @@ -177,45 +202,57 @@ PRECISION global_norm_PRECISION( vector_PRECISION x, int start, int end, level_s int thread_start; int thread_end; - compute_core_start_end(start, end, &thread_start, &thread_end, l, threading); - SYNC_CORES(threading) + SYNC_CORES(threading); + +#ifndef OPTIMIZED_LINALG_PRECISION + compute_core_start_end(start, end, &thread_start, &thread_end, l, threading); VECTOR_FOR( int i=thread_start, iworkspace)[threading->core] = local_alpha; - END_NO_HYPERTHREADS(threading) + ((PRECISION *)threading->workspace)[threading->core] = local_alpha; + END_NO_HYPERTHREADS(threading); // master sums up all results - SYNC_CORES(threading) - START_MASTER(threading) - for(int i=1; in_core; i++) - ((PRECISION *)threading->workspace)[0] += ((PRECISION *)threading->workspace)[i]; - local_alpha = ((PRECISION *)threading->workspace)[0]; - END_MASTER(threading) + SYNC_CORES(threading); + MASTER(threading) { + for(int i=1; in_core; i++) + ((PRECISION *)threading->workspace)[0] += ((PRECISION *)threading->workspace)[i]; + local_alpha = ((PRECISION *)threading->workspace)[0]; + } if ( g.num_processes > 1 ) { - START_MASTER(threading) - PROF_PRECISION_START( _ALLR ); - MPI_Allreduce( &local_alpha, &global_alpha, 1, MPI_PRECISION, MPI_SUM, (l->depth==0)?g.comm_cart:l->gs_PRECISION.level_comm ); - PROF_PRECISION_STOP( _ALLR, 1 ); - ((PRECISION *)threading->workspace)[0] = global_alpha; - END_MASTER(threading) + MASTER(threading) { + PROF_PRECISION_START( _ALLR ); + MPI_Allreduce( &local_alpha, &global_alpha, 1, MPI_PRECISION, MPI_SUM, (l->depth==0)?g.comm_cart:l->gs_PRECISION.level_comm ); + PROF_PRECISION_STOP( _ALLR, 1 ); + ((PRECISION *)threading->workspace)[0] = global_alpha; + } // all threads need the result of the norm - SYNC_MASTER_TO_ALL(threading) + SYNC_MASTER_TO_ALL(threading); global_alpha = ((PRECISION *)threading->workspace)[0]; PROF_PRECISION_STOP( _GIP, (double)(end-start)/(double)l->inner_vector_size, threading ); return (PRECISION)sqrt((double)global_alpha); } else { // all threads need the result of the norm - SYNC_MASTER_TO_ALL(threading) + SYNC_MASTER_TO_ALL(threading); local_alpha = ((PRECISION *)threading->workspace)[0]; PROF_PRECISION_STOP( _GIP, (double)(end-start)/(double)l->inner_vector_size, threading ); return (PRECISION)sqrt((double)local_alpha); } } -#endif + PRECISION process_norm_PRECISION( vector_PRECISION x, int start, int end, level_struct *l, struct Thread *threading ) { @@ -245,6 +282,53 @@ PRECISION process_norm_PRECISION( vector_PRECISION x, int start, int end, level_ return (PRECISION)sqrt((double)local_alpha); } +// vector storage for PRECISION precision +void vector_PRECISION_define( vector_PRECISION phi, complex_PRECISION value, int start, int end, level_struct *l, Thread *threading ) { + + int i; + PROF_PRECISION_START( _SET, threading ); + + THREADED_VECTOR_FOR( i, start, end, phi[i] = value, i++, l, threading ); + + PROF_PRECISION_STOP( _SET, 1, threading ); +} + +void vector_PRECISION_define_real( vector_PRECISION phi, PRECISION value, int start, int end, level_struct *l, Thread *threading ) { + + int i; + PROF_PRECISION_START( _SET, threading ); + + PRECISION *phi_pt = (PRECISION*) phi; + THREADED_VECTOR_FOR( i, 2*start, 2*end, phi_pt[i] = value; phi_pt[i+1] = 0, i+=2, l, threading ); + + PROF_PRECISION_STOP( _SET, 1, threading ); +} + +void 
vector_PRECISION_define_zero( vector_PRECISION phi, int start, int end, level_struct *l, Thread *threading ) { + + int i; + PROF_PRECISION_START( _SET, threading ); + + PRECISION *phi_pt = (PRECISION*) phi; + THREADED_VECTOR_FOR( i, 2*start, 2*end, phi_pt[i] = phi_pt[i+1] = 0, i+=2, l, threading ); + + PROF_PRECISION_STOP( _SET, 1, threading ); +} + + +void vector_PRECISION_define_random( vector_PRECISION phi, int start, int end, level_struct *l, Thread *threading ) { + + int i; + PROF_PRECISION_START( _SET, threading ); + + // this would yield different results if we threaded it, so we don't + START_LOCKED_MASTER(threading) + VECTOR_FOR( i=start, iinner_vector_size ); } -#ifndef OPTIMIZED_LINALG_PRECISION -void vector_PRECISION_scale( vector_PRECISION z, vector_PRECISION x, complex_PRECISION alpha, int start, int end, level_struct *l ) { +void vector_PRECISION_scale( vector_PRECISION z, vector_PRECISION x, complex_PRECISION alpha, int start, int end, level_struct *l, struct Thread *threading ) { - int thread = omp_get_thread_num(); - if(thread == 0 && start != end) - PROF_PRECISION_START( _LA6 ); + int thread_start, thread_end; + PROF_PRECISION_START( _LA6, threading ); + +#ifndef OPTIMIZED_LINALG_PRECISION + compute_core_start_end(start, end, &thread_start, &thread_end, l, threading); VECTOR_FOR( int i=start, iinner_vector_size ); -} +#else + compute_core_start_end_custom(start, end, &thread_start, &thread_end, l, threading, SIMD_LENGTH_PRECISION); + mm_PRECISION alpha_re = mm_set1_PRECISION( creal_PRECISION(alpha) ); + mm_PRECISION alpha_im = mm_set1_PRECISION( cimag_PRECISION(alpha) ); + + for( int i=start; iinner_vector_size, threading ); +} void vector_PRECISION_real_scale( vector_PRECISION z, vector_PRECISION x, complex_PRECISION alpha, @@ -307,126 +403,92 @@ void vector_PRECISION_copy( vector_PRECISION z, vector_PRECISION x, int start, i int thread = omp_get_thread_num(); if(thread == 0 && start != end) - PROF_PRECISION_START( _CPY ); - + PROF_PRECISION_START( _CPY ); + VECTOR_FOR( int i=start, iinner_vector_size ); + PROF_PRECISION_STOP( _CPY, (double)(end-start)/(double)l->inner_vector_size ); } -#ifndef OPTIMIZED_LINALG_PRECISION -void vector_PRECISION_saxpy( vector_PRECISION z, vector_PRECISION x, vector_PRECISION y, complex_PRECISION alpha, int start, int end, level_struct *l ) { +void vector_PRECISION_saxpy( vector_PRECISION z, vector_PRECISION x, vector_PRECISION y, complex_PRECISION alpha, + int start, int end, level_struct *l, struct Thread *threading ) { - int thread = omp_get_thread_num(); - if (thread == 0 && start != end ) - PROF_PRECISION_START( _LA8 ); + int thread_start, thread_end; + PROF_PRECISION_START( _LA8, threading ); +#ifndef OPTIMIZED_LINALG_PRECISION + + compute_core_start_end(start, end, &thread_start, &thread_end, l, threading); VECTOR_FOR( int i=start, iinner_vector_size ); -} +#else + compute_core_start_end_custom(start, end, &thread_start, &thread_end, l, threading, SIMD_LENGTH_PRECISION); + mm_PRECISION alpha_re = mm_set1_PRECISION( creal_PRECISION(alpha) ); + mm_PRECISION alpha_im = mm_set1_PRECISION( cimag_PRECISION(alpha) ); + + for ( int i=start; iinner_vector_size, threading ); +} + +void vector_PRECISION_multi_saxpy( vector_PRECISION z, vector_PRECISION *V, complex_PRECISION *alpha, int sign, + int count, int start, int end, level_struct *l, struct Thread *threading ) { - int thread = omp_get_thread_num(); - if (thread == 0 && start != end ) - PROF_PRECISION_START( _LA8 ); + int thread_start, thread_end; + PROF_PRECISION_START( _LA8, threading 
); +#ifndef OPTIMIZED_LINALG_PRECISION + complex_PRECISION alpha_signed[count]; for ( int c=0; cstart_site[l->depth]; - vector_PRECISION eta_end = eta + threading->end_index[l->depth]; - eta += threading->start_index[l->depth]; - phi += threading->start_index[l->depth]; - - while ( eta < eta_end ) { - if(g.odd_even_table[i]==_ODD){ - FOR12( *eta = (*phi); phi++; eta++; ) - } - else if(g.odd_even_table[i]==_EVEN){ - FOR12( *eta = 0; phi++; eta++; ) - } - i++; - } -} - -void vector_PRECISION_gamma5_set_even_to_zero( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ) { - - int i = threading->start_site[l->depth]; - vector_PRECISION eta_end = eta + threading->end_index[l->depth]; - eta += threading->start_index[l->depth]; - phi += threading->start_index[l->depth]; - - while ( eta < eta_end ) { - if(g.odd_even_table[i]==_ODD){ - FOR6( *eta = -(*phi); phi++; eta++; ) - FOR6( *eta = (*phi); phi++; eta++; ) - } - else if(g.odd_even_table[i]==_EVEN){ - FOR12( *eta = 0; phi++; eta++; ) - } - i++; - } -} + } +#else -void vector_PRECISION_set_odd_to_zero( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ) { - - int i = threading->start_site[l->depth]; - vector_PRECISION eta_end = eta + threading->end_index[l->depth]; - eta += threading->start_index[l->depth]; - phi += threading->start_index[l->depth]; - - while ( eta < eta_end ) { - if(g.odd_even_table[i]==_EVEN){ - FOR12( *eta = (*phi); phi++; eta++; ) - } - else if(g.odd_even_table[i]==_ODD){ - FOR12( *eta = 0; phi++; eta++; ) - } - i++; + compute_core_start_end_custom(start, end, &thread_start, &thread_end, l, threading, SIMD_LENGTH_PRECISION); + for ( int c=0; cstart_site[l->depth]; - vector_PRECISION eta_end = eta + threading->end_index[l->depth]; - eta += threading->start_index[l->depth]; - phi += threading->start_index[l->depth]; - - while ( eta < eta_end ) { - if(g.odd_even_table[i]==_EVEN){ - FOR6( *eta = -(*phi); phi++; eta++; ) - FOR6( *eta = (*phi); phi++; eta++; ) - } - else if(g.odd_even_table[i]==_ODD){ - FOR12( *eta = 0; phi++; eta++; ) - } - i++; - } -} void vector_PRECISION_projection( vector_PRECISION z, vector_PRECISION v, int k, vector_PRECISION *W, complex_PRECISION *diag, int orthogonal, level_struct *l, Thread *threading ) { @@ -439,7 +501,7 @@ void vector_PRECISION_projection( vector_PRECISION z, vector_PRECISION v, int k, complex_PRECISION ip[k], ip_buffer[2*k]; MALLOC( v_tmp, complex_PRECISION, l->inner_vector_size ); - vector_PRECISION_define(v_tmp, 0, 0, l->inner_vector_size, l ); + vector_PRECISION_define_zero( v_tmp, 0, l->inner_vector_size, l, threading ); MALLOC( W_tmp, complex_PRECISION*, k ); W_tmp[0] = NULL; @@ -448,7 +510,7 @@ void vector_PRECISION_projection( vector_PRECISION z, vector_PRECISION v, int k, W_tmp[j] = W_tmp[0]+j*l->inner_vector_size; for ( j=0; jinner_vector_size, l ); + vector_PRECISION_scale( W_tmp[j], W[j], diag[j], 0, l->inner_vector_size, l, threading ); } process_multi_inner_product_PRECISION( k, ip, W_tmp, v, 0, l->inner_vector_size, l, threading ); @@ -457,10 +519,10 @@ void vector_PRECISION_projection( vector_PRECISION z, vector_PRECISION v, int k, ip_buffer[j] = ip[j]; } MPI_Allreduce( ip_buffer, ip_buffer+k, k, MPI_COMPLEX_PRECISION, MPI_SUM, (l->depth==0)?g.comm_cart:l->gs_PRECISION.level_comm ); - END_MASTER(threading) - SYNC_MASTER_TO_ALL(threading) + END_MASTER(threading); + SYNC_MASTER_TO_ALL(threading); - vector_PRECISION_multi_saxpy( v_tmp, W_tmp, ip_buffer+k, 1, k, 0, l->inner_vector_size, l ); + 
vector_PRECISION_multi_saxpy( v_tmp, W_tmp, ip_buffer+k, 1, k, 0, l->inner_vector_size, l, threading ); if (orthogonal) vector_PRECISION_minus( z, v, v_tmp, 0, l->inner_vector_size, l ); @@ -472,75 +534,6 @@ void vector_PRECISION_projection( vector_PRECISION z, vector_PRECISION v, int k, FREE( W_tmp, complex_PRECISION*, k ); } -void gram_schmidt_on_aggregates_PRECISION( vector_PRECISION *V, const int num_vect, level_struct *l, struct Thread *threading ) { - - PROF_PRECISION_START( _GRAM_SCHMIDT_ON_AGGREGATES, threading ); - SYNC_CORES(threading) - SYNC_HYPERTHREADS(threading) - int i, j, k, k1, k2, num_aggregates = l->s_PRECISION.num_aggregates, - aggregate_size = l->inner_vector_size / num_aggregates, offset = l->num_lattice_site_var/2; - - complex_PRECISION alpha1, alpha2; - vector_PRECISION v_pt1, v_pt2; - PRECISION norm1, norm2; - - for ( j=threading->n_thread*threading->core+threading->thread; jn_thread*threading->n_core ) { - for ( k1=0; k1 V[k2] | 2*j-th and 2*j+1-st aggregate - for ( i=0; ivector_size-l->inner_vector_size)/(double)l->inner_vector_size, threading ); } - - -void gram_schmidt_PRECISION( vector_PRECISION *V, complex_PRECISION *buffer, const int begin, const int n, level_struct *l, struct Thread *threading ) { - - // NOTE: only thread safe, if "buffer" is the same buffer for all threads belonging to a common MPI process - START_MASTER(threading) - PROF_PRECISION_START( _LA ); - END_MASTER(threading) - SYNC_CORES(threading) - - PRECISION beta; - int i, j, start, end; - - compute_core_start_end_custom( 0, l->inner_vector_size, &start, &end, l, threading, l->num_lattice_site_var ); - - for ( i=begin; iinner_vector_size, l, threading ); - SYNC_CORES(threading) - START_MASTER(threading) - for ( j=0; j0 ) { - START_MASTER(threading) - PROF_PRECISION_START( _ALLR ); - MPI_Allreduce( buffer, buffer+n, i, MPI_COMPLEX_PRECISION, MPI_SUM, (l->depth==0)?g.comm_cart:l->gs_PRECISION.level_comm ); - PROF_PRECISION_STOP( _ALLR, 1 ); - END_MASTER(threading) - SYNC_MASTER_TO_ALL(threading) - } - - for( j=0; jinner_vector_size, l, threading ); - SYNC_MASTER_TO_ALL(threading) - vector_PRECISION_real_scale( V[i], V[i], creal(1.0/beta), start, end, l ); - SYNC_CORES(threading) - } - - START_MASTER(threading) - PROF_PRECISION_STOP( _LA, 1 ); - END_MASTER(threading) - SYNC_CORES(threading) -} - - -#if !defined( SSE ) || !defined( GRAM_SCHMIDT_VECTORIZED_PRECISION ) -void setup_gram_schmidt_PRECISION_compute_dots( - complex_PRECISION *thread_buffer, vector_PRECISION *V, int count, int offset, - int start, int end, level_struct *l, struct Thread *threading) { - - int thread_start; - int thread_end; - int cache_block_size = 12*64; - complex_PRECISION tmp[cache_block_size]; - - for(int i=0; i<2*offset; i++) - thread_buffer[i] = 0.0; - - SYNC_CORES(threading) - compute_core_start_end_custom(start, end, &thread_start, &thread_end, l, threading, cache_block_size); - - for ( int i=thread_start; iworkspace)[threading->core] = thread_buffer; - END_NO_HYPERTHREADS(threading) - // master sums up all results - SYNC_CORES(threading) - START_MASTER(threading) - for(int i=1; in_core; i++) { - for(int j=0; jworkspace)[0][j] += ((complex_PRECISION **)threading->workspace)[i][j]; - ((complex_PRECISION **)threading->workspace)[0][j+offset] += ((complex_PRECISION **)threading->workspace)[i][j+offset]; - } - } - END_MASTER(threading) - // only master needs the result in this case (it will be distributed later) -} -#endif - - -#if !defined( SSE ) || !defined( GRAM_SCHMIDT_VECTORIZED_PRECISION ) -void 
setup_gram_schmidt_PRECISION_axpys( - complex_PRECISION *thread_buffer, vector_PRECISION *V, int count, int offset, - int start, int end, level_struct *l, struct Thread *threading) { - - int thread_start; - int thread_end; - int cache_block_size = 12*64; - complex_PRECISION tmp[cache_block_size]; - - compute_core_start_end_custom(start, end, &thread_start, &thread_end, l, threading, cache_block_size); - - for ( int i=thread_start; iinner_vector_size; - int thread_start = threading->start_index[l->depth]; - int thread_end = threading->end_index[l->depth]; - - complex_PRECISION thread_buffer[4*n]; - - for ( i=0; i<4*n; i++ ) - thread_buffer[i] = 0; - - for ( i=0; idepth > 0 ) { - coarse_gamma5_PRECISION( g5v, V[i], thread_start, thread_end, l ); - for ( j=0; j0 ) { - PROF_PRECISION_START( _ALLR ); - MPI_Allreduce( thread_buffer, thread_buffer+2*n, 2*n, MPI_COMPLEX_PRECISION, MPI_SUM, (l->depth==0)?g.comm_cart:l->gs_PRECISION.level_comm ); - PROF_PRECISION_STOP( _ALLR, 1 ); - } - for ( j=2*n; j<4*n; j++ ) - ((complex_PRECISION *)(threading->workspace))[j] = thread_buffer[j]; - END_LOCKED_MASTER(threading) - for ( j=2*n; j<4*n; j++ ) - thread_buffer[j] = ((complex_PRECISION *)(threading->workspace))[j]; - - - if ( l->depth > 0 ) { - for( j=0; jinner_vector_size, threading ); -} - diff --git a/src/linalg_generic.h b/src/linalg_generic.h index 29af91e..7df053d 100644 --- a/src/linalg_generic.h +++ b/src/linalg_generic.h @@ -22,17 +22,6 @@ #ifndef LINALG_PRECISION_HEADER #define LINALG_PRECISION_HEADER -#ifdef _M10TV - #define VECTOR_FOR( start, end, expression, update, l ) do{ \ - if ( l->depth == 0 ) { \ - for ( start; end; ) \ - FOR12( expression; update; ) \ - } else { \ - for ( start; end; ) \ - FOR20( expression; update; ) \ - } \ - } while(0) -#else #define VECTOR_FOR( start, end, expression, update, l ) do{ \ if ( l->depth == 0 ) { \ for ( start; end; ) \ @@ -42,20 +31,7 @@ FOR2( expression; update; ) \ } \ } while(0) -#endif - -#ifdef _M10TV - #define REAL_VECTOR_FOR( start, end, expression, update, l ) do{ \ - if ( l->depth == 0 ) { \ - for ( start; end; ) \ - FOR24( expression; update; ) \ - } else { \ - for ( start; end; ) \ - FOR40( expression; update; ) \ - } \ - } while(0) -#else #define REAL_VECTOR_FOR( start, end, expression, update, l ) do{ \ if ( l->depth == 0 ) { \ for ( start; end; ) \ @@ -65,23 +41,7 @@ FOR4( expression; update; ) \ } \ } while(0) -#endif - -#ifdef _M10TV - #define THREADED_VECTOR_FOR( i, start_index, end_index, expression, update, l, threading ) do{ \ - int thread_start, thread_end; \ - if ( l->depth == 0 ) { \ - compute_core_start_end_custom(start_index, end_index, &thread_start, &thread_end, l, threading, 12); \ - for ( i=thread_start; idepth == 0 ) { \ @@ -94,8 +54,6 @@ FOR2( expression; update; ) \ } \ } while(0) -#endif - struct Thread; @@ -109,41 +67,28 @@ PRECISION process_norm_PRECISION( vector_PRECISION x, int start, int end, level_struct *l, struct Thread *threading ); complex_PRECISION local_xy_over_xx_PRECISION( vector_PRECISION phi, vector_PRECISION psi, int start, int end, level_struct *l ); + + void vector_PRECISION_define( vector_PRECISION phi, complex_PRECISION value, int start, int end, level_struct *l, Thread *threading ); + void vector_PRECISION_define_real( vector_PRECISION phi, PRECISION value, int start, int end, level_struct *l, Thread *threading ); + void vector_PRECISION_define_zero( vector_PRECISION phi, int start, int end, level_struct *l, Thread *threading ); + void vector_PRECISION_define_random( vector_PRECISION phi, int 
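The removed setup_gram_schmidt_PRECISION_compute_dots accumulates many inner products against one candidate vector in cache-sized blocks (12*64 entries there), so that each block of the right-hand vector is loaded once and reused by every test vector, with each core depositing its partial sums in threading->workspace for the master to combine. A rough single-process sketch of the blocking idea only (illustrative names; the per-core reduction and the gamma5 split of the removed routine are omitted):

#include <complex.h>

/* dots[c] = <V[c], w> for c = 0..count-1, sweeping w in blocks so
 * that one block of w stays in cache while all V[c] visit it.    */
static void blocked_dots( double complex *dots, double complex **V,
                          const double complex *w, int count,
                          int len, int block ) {
  for ( int c=0; c<count; c++ )
    dots[c] = 0;
  for ( int i=0; i<len; i+=block ) {
    int b = ( i+block <= len ) ? block : len-i;
    for ( int c=0; c<count; c++ )
      for ( int k=0; k<b; k++ )
        dots[c] += conj(V[c][i+k]) * w[i+k];
  }
}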
start, int end, level_struct *l, Thread *threading ); void vector_PRECISION_plus( vector_PRECISION z, vector_PRECISION x, vector_PRECISION y, int start, int end, level_struct *l ); // z := x + y void vector_PRECISION_minus( vector_PRECISION z, vector_PRECISION x, vector_PRECISION y, int start, int end, level_struct *l ); // z := x - y - void vector_PRECISION_scale( vector_PRECISION z, vector_PRECISION x, complex_PRECISION alpha, int start, int end, level_struct *l ); // z := alpha*x + // z := alpha*x + void vector_PRECISION_scale( vector_PRECISION z, vector_PRECISION x, complex_PRECISION alpha, int start, int end, + level_struct *l, struct Thread *threading ); void vector_PRECISION_real_scale( vector_PRECISION z, vector_PRECISION x, complex_PRECISION alpha, int start, int end, level_struct *l ); - void vector_PRECISION_saxpy( vector_PRECISION z, vector_PRECISION x, vector_PRECISION y, complex_PRECISION alpha, int start, int end, level_struct *l ); // z := x + alpha*y - void vector_PRECISION_copy( vector_PRECISION z, vector_PRECISION x, int start, int end, level_struct *l ); // z := x - void vector_PRECISION_set_even_to_zero( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ); - void vector_PRECISION_gamma5_set_even_to_zero( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ); - void vector_PRECISION_set_odd_to_zero( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ); - void vector_PRECISION_gamma5_set_odd_to_zero( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ); - void vector_PRECISION_projection( vector_PRECISION z, vector_PRECISION v, int k, vector_PRECISION *W, complex_PRECISION *diag, - int orthogonal, level_struct *l, Thread *threading ); - - void gram_schmidt_on_aggregates_PRECISION( vector_PRECISION *V, const int num_vec, level_struct *l, struct Thread *threading ); - - // Gram-Schmidt on a block of vectors, used by Block-Gram-Schmidt - void aggregate_gram_schmidt_block_PRECISION( PRECISION *V, - int num_vec, int leading_dimension, level_struct *l, struct Thread *threading ); - // used by Block-Gram-Schmidt - void aggregate_orthogonalize_block_wrt_orthonormal_block_PRECISION( PRECISION *B, PRECISION *U, - int num_vec, level_struct *l, struct Thread *threading ); - // used by Block-Gram-Schmidt - void aggregate_block_dot_block_PRECISION( PRECISION *S, PRECISION *U, PRECISION *B, - int num_vec, int leading_dimension, level_struct *l, struct Thread *threading ); - // used by Block-Gram-Schmidt - void aggregate_block_minus_block_times_dot_PRECISION( PRECISION *B, PRECISION *U, PRECISION *S, - int num_vec, int leading_dimension, level_struct *l, struct Thread *threading ); + // z := x + alpha*y + void vector_PRECISION_saxpy( vector_PRECISION z, vector_PRECISION x, vector_PRECISION y, complex_PRECISION alpha, + int start, int end, level_struct *l, struct Thread *threading ); + void vector_PRECISION_multi_saxpy( vector_PRECISION z, vector_PRECISION *V, complex_PRECISION *alpha, int sign, + int count, int start, int end, level_struct *l, struct Thread *threading ); + // z := x + void vector_PRECISION_copy( vector_PRECISION z, vector_PRECISION x, int start, int end, level_struct *l ); - void gram_schmidt_PRECISION( vector_PRECISION *V, complex_PRECISION *buffer, const int start, const int n, level_struct *l, struct Thread *threading ); - void setup_gram_schmidt_PRECISION( vector_PRECISION *V, vector_PRECISION g5v, - complex_PRECISION *buffer, 
const int n, level_struct *l, - struct Thread *threading ); - void spinwise_PRECISION_skalarmultiply( vector_PRECISION eta1, vector_PRECISION eta2, - vector_PRECISION phi, complex_PRECISION alpha, int start, int end, level_struct *l ); + void vector_PRECISION_projection( vector_PRECISION z, vector_PRECISION v, int k, vector_PRECISION *W, complex_PRECISION *diag, int orthogonal, level_struct *l, struct Thread *threading ); + void set_boundary_PRECISION( vector_PRECISION phi, complex_PRECISION alpha, level_struct *l, struct Thread *threading ); #endif diff --git a/src/linsolve.c b/src/linsolve.c index 720dc4c..411bc0c 100644 --- a/src/linsolve.c +++ b/src/linsolve.c @@ -47,7 +47,6 @@ void fgmres_MP_struct_alloc( int m, int n, int vl, double tol, const int prec_ki p->dp.print = g.vt.evaluation?0:1; p->sp.print = g.vt.evaluation?0:1; p->dp.initial_guess_zero = 1; p->sp.initial_guess_zero = 1; - p->dp.shift = 0; p->sp.shift = 0; p->dp.v_start = 0; p->sp.v_start = 0; p->dp.v_end = l->inner_vector_size; p->sp.v_end = l->inner_vector_size; @@ -60,6 +59,10 @@ void fgmres_MP_struct_alloc( int m, int n, int vl, double tol, const int prec_ki g.p.eval_operator = d_plus_clover_double; } +#ifdef HAVE_TM1p1 + vl*=2; +#endif + // double precision part total = 0; total += (m+1)*m; // Hessenberg matrix @@ -200,7 +203,12 @@ int fgmres_MP( gmres_MP_struct *p, level_struct *l, struct Thread *threading ) { SYNC_MASTER_TO_ALL(threading) if( ol == 0) { - norm_r0 = creal(gamma0); + if (l->depth == 0 && !p->dp.initial_guess_zero) { + norm_r0 = global_norm_double( p->dp.b, start, end, l, threading ); + printf0("| initial guess relative residual: %le |\n", creal(gamma0)/norm_r0); + } else { + norm_r0 = creal(gamma0); + } } #if defined(TRACK_RES) && !defined(WILSON_BENCHMARK) else { @@ -220,12 +228,11 @@ int fgmres_MP( gmres_MP_struct *p, level_struct *l, struct Thread *threading ) { // inner loop in single precision for( il=0; ildp.restart_length && finish==0; il++) { j = il; iter++; - arnoldi_step_MP( p->sp.V, p->sp.Z, p->sp.w, p->dp.H, p->dp.y, j, p->sp.preconditioner, - p->sp.shift, &(p->sp), l, threading ); + arnoldi_step_MP( p->sp.V, p->sp.Z, p->sp.w, p->dp.H, p->dp.y, j, p->sp.preconditioner, &(p->sp), l, threading ); if ( cabs( p->dp.H[j][j+1] ) > 1E-15 ) { qr_update_double( p->dp.H, p->dp.s, p->dp.c, p->dp.gamma, j, l, threading ); - gamma_jp1 = cabs( p->dp.gamma[j+1] ); + gamma_jp1 = cabs( p->dp.gamma[j+1] ); if ( iter%10 == 0 || p->sp.preconditioner != NULL || l->depth > 0 ) { #if defined(TRACK_RES) && !defined(WILSON_BENCHMARK) @@ -316,8 +323,7 @@ int fgmres_MP( gmres_MP_struct *p, level_struct *l, struct Thread *threading ) { void arnoldi_step_MP( vector_float *V, vector_float *Z, vector_float w, complex_double **H, complex_double* buffer, int j, void (*prec)(), - complex_float shift, gmres_float_struct *p, level_struct *l, - struct Thread *threading ) { + gmres_float_struct *p, level_struct *l, struct Thread *threading ) { SYNC_MASTER_TO_ALL(threading) SYNC_CORES(threading) @@ -332,7 +338,6 @@ void arnoldi_step_MP( vector_float *V, vector_float *Z, vector_float w, if ( prec != NULL ) { if ( p->kind == _LEFT ) { apply_operator_float( Z[0], V[j], p, l, threading ); - if ( shift ) vector_float_saxpy( Z[0], Z[0], V[j], shift, start, end, l ); prec( w, NULL, Z[0], _NO_RES, l, threading ); } else { if ( g.mixed_precision == 2 && (g.method >= 1 && g.method <= 2 ) ) { @@ -342,11 +347,9 @@ void arnoldi_step_MP( vector_float *V, vector_float *Z, vector_float w, prec( Z[j], NULL, V[j], _NO_RES, l, threading ); 
apply_operator_float( w, Z[j], p, l, threading ); // w = D*Z[j] } - if ( shift ) vector_float_saxpy( w, w, Z[j], shift, start, end, l ); } } else { apply_operator_float( w, V[j], p, l, threading ); // w = D*V[j] - if ( shift ) vector_float_saxpy( w, w, V[j], shift, start, end, l ); } complex_double tmp[j+1]; @@ -370,7 +373,7 @@ void arnoldi_step_MP( vector_float *V, vector_float *Z, vector_float w, complex_float alpha[j+1]; for( i=0; i<=j; i++ ) alpha[i] = (complex_float) -H[j][i]; - vector_float_multi_saxpy( w, V, alpha, 1, j+1, start, end, l ); + vector_float_multi_saxpy( w, V, alpha, 1, j+1, p->v_start, p->v_end, l, threading ); complex_double tmp2 = global_norm_MP( w, p->v_start, p->v_end, l, threading ); START_MASTER(threading) @@ -390,11 +393,6 @@ void compute_solution_MP( vector_float x, vector_float *V, complex_double *y, int i, k; // start and end indices for vector functions depending on thread - int start; - int end; - // compute start and end indices for core - // this puts zero for all other hyperthreads, so we can call functions below with all hyperthreads - compute_core_start_end(p->v_start, p->v_end, &start, &end, l, threading); START_MASTER(threading) @@ -415,12 +413,12 @@ void compute_solution_MP( vector_float x, vector_float *V, complex_double *y, SYNC_MASTER_TO_ALL(threading) // x = V*y - vector_float_scale( x, V[0], (complex_float) y[0], start, end, l ); + vector_float_scale( x, V[0], (complex_float) y[0], p->v_start, p->v_end, l, threading ); complex_float alpha[j]; for ( i=1; i<=j; i++ ) alpha[i-1] = (complex_float) y[i]; - vector_float_multi_saxpy( x, &(V[1]), alpha, 1, j, start, end, l ); + vector_float_multi_saxpy( x, &(V[1]), alpha, 1, j, p->v_start, p->v_end, l, threading ); } diff --git a/src/linsolve.h b/src/linsolve.h index 55ed9fd..86dd54f 100644 --- a/src/linsolve.h +++ b/src/linsolve.h @@ -29,8 +29,7 @@ void arnoldi_step_MP( vector_float *V, vector_float *Z, vector_float w, complex_double **H, complex_double* buffer, int j, void (*prec)(), - complex_float shift, gmres_float_struct *p, level_struct *l, - struct Thread *threading ); + gmres_float_struct *p, level_struct *l, struct Thread *threading ); void compute_solution_MP( vector_float x, vector_float *V, complex_double *y, complex_double *gamma, complex_double **H, int j, diff --git a/src/linsolve_generic.c b/src/linsolve_generic.c index 365bdf2..ae8f167 100644 --- a/src/linsolve_generic.c +++ b/src/linsolve_generic.c @@ -39,7 +39,6 @@ void fgmres_PRECISION_struct_init( gmres_PRECISION_struct *p ) { p->gamma = NULL; p->c = NULL; p->s = NULL; - p->shift = 0; p->preconditioner = NULL; p->eval_operator = NULL; } @@ -71,6 +70,10 @@ void fgmres_PRECISION_struct_alloc( int m, int n, int vl, PRECISION tol, const i p->eval_operator = eval_op; p->tol = tol; p->kind = prec_kind; + +#ifdef HAVE_TM1p1 + vl*=2; +#endif if(m > 0) { total += (m+1)*m; // Hessenberg matrix @@ -147,7 +150,6 @@ void fgmres_PRECISION_struct_alloc( int m, int n, int vl, PRECISION tol, const i p->timing = 1; p->print = g.vt.evaluation?0:1; p->initial_guess_zero = 1; - p->shift = 0; p->v_start = 0; p->v_end = l->inner_vector_size; p->op = &(g.op_PRECISION); @@ -156,7 +158,6 @@ void fgmres_PRECISION_struct_alloc( int m, int n, int vl, PRECISION tol, const i p->timing = 0; p->print = 0; p->initial_guess_zero = 1; - p->shift = 0; p->v_start = 0; p->v_end = l->inner_vector_size; p->op = &(l->s_PRECISION.op); @@ -165,7 +166,6 @@ void fgmres_PRECISION_struct_alloc( int m, int n, int vl, PRECISION tol, const i p->print = 0; p->initial_guess_zero = 
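compute_solution_MP (and compute_solution_PRECISION further below) assemble the Krylov update x = V*y, respectively x += V*y after a restart, from the stored basis vectors and the small least-squares solution y of the Hessenberg system; the hunks above merely route the scale/saxpy calls through the threaded [v_start,v_end) interface. A plain, unthreaded sketch with illustrative names:

#include <complex.h>

/* x = V*y (accumulate==0) or x += V*y (accumulate!=0), where V
 * holds j+1 basis vectors of length n and y the least-squares
 * coefficients from the Hessenberg matrix.                      */
static void form_solution( double complex *x, double complex **V,
                           const double complex *y, int j, int n,
                           int accumulate ) {
  int i0 = 0;
  if ( !accumulate ) {
    for ( int k=0; k<n; k++ )
      x[k] = y[0] * V[0][k];                 /* x = y[0]*V[0] */
    i0 = 1;
  }
  for ( int i=i0; i<=j; i++ )
    for ( int k=0; k<n; k++ )
      x[k] += y[i] * V[i][k];                /* x += y[i]*V[i] */
}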
1; p->layout = -1; - p->shift = 0; p->v_start = 0; p->v_end = l->inner_vector_size; if ( g.odd_even ) @@ -232,7 +232,7 @@ int fgmres_PRECISION( gmres_PRECISION_struct *p, level_struct *l, struct Thread complex_PRECISION beta = 0; - double norm_r0=1, gamma_jp1=1, t0=0, t1=0; + PRECISION norm_r0=1, gamma_jp1=1, t0=0, t1=0; START_LOCKED_MASTER(threading) if ( l->depth==0 && ( p->timing || p->print ) ) prof_init( l ); @@ -258,7 +258,6 @@ int fgmres_PRECISION( gmres_PRECISION_struct *p, level_struct *l, struct Thread res = _RES; if ( p->kind == _LEFT && p->preconditioner ) { apply_operator_PRECISION( p->Z[0], p->x, p, l, threading ); - if ( p->shift ) vector_PRECISION_saxpy( p->Z[0], p->Z[0], p->x, p->shift, start, end, l ); if ( g.method == 5 ) { START_LOCKED_MASTER(threading) g.bicgstab_tol = (!g.mixed_precision)?p->tol:MAX( 1E-3, (p->tol/(gamma_jp1/norm_r0))*5E-1 ); @@ -270,20 +269,25 @@ int fgmres_PRECISION( gmres_PRECISION_struct *p, level_struct *l, struct Thread } vector_PRECISION_minus( p->r, p->b, p->w, start, end, l ); // compute r = b - w } - gamma0 = (complex_PRECISION) global_norm_PRECISION( p->r, p->v_start, p->v_end, l, threading ); // gamma_0 = norm(r) + gamma0 = global_norm_PRECISION( p->r, p->v_start, p->v_end, l, threading ); // gamma_0 = norm(r) START_MASTER(threading) p->gamma[0] = gamma0; - END_MASTER(threading) - SYNC_MASTER_TO_ALL(threading) + END_MASTER(threading); + SYNC_MASTER_TO_ALL(threading); - if( ol == 0) { - norm_r0 = creal(p->gamma[0]); + if ( ol == 0 ) { + if (l->depth == 0 && !p->initial_guess_zero) { + norm_r0 = global_norm_PRECISION( p->b, p->v_start, p->v_end, l, threading ); + printf0("| initial guess relative residual: %le |\n", creal(gamma0)/norm_r0); + } else { + norm_r0 = creal(p->gamma[0]); + } } - + vector_PRECISION_real_scale( p->V[0], p->r, 1/p->gamma[0], start, end, l ); // v_0 = r / gamma_0 #if defined(SINGLE_ALLREDUCE_ARNOLDI) && defined(PIPELINED_ARNOLDI) if ( l->level == 0 && l->depth > 0 ) { - arnoldi_step_PRECISION( p->V, p->Z, p->w, p->H, p->y, 0, p->preconditioner, p->shift, p, l, threading ); + arnoldi_step_PRECISION( p->V, p->Z, p->w, p->H, p->y, 0, p->preconditioner, p, l, threading ); } #endif @@ -298,18 +302,18 @@ int fgmres_PRECISION( gmres_PRECISION_struct *p, level_struct *l, struct Thread // one step of Arnoldi #if defined(SINGLE_ALLREDUCE_ARNOLDI) && defined(PIPELINED_ARNOLDI) if ( l->level == 0 && l->depth > 0 ) { - if ( !arnoldi_step_PRECISION( p->V, p->Z, p->w, p->H, p->y, j+1, p->preconditioner, p->shift, p, l, threading ) ) { + if ( !arnoldi_step_PRECISION( p->V, p->Z, p->w, p->H, p->y, j+1, p->preconditioner, p, l, threading ) ) { printf0("| -------------- iteration %d, restart due to H(%d,%d) < 0 |\n", iter, j+2, j+1 ); break; } } else { - if ( !arnoldi_step_PRECISION( p->V, p->Z, p->w, p->H, p->y, j, p->preconditioner, p->shift, p, l, threading ) ) { + if ( !arnoldi_step_PRECISION( p->V, p->Z, p->w, p->H, p->y, j, p->preconditioner, p, l, threading ) ) { printf0("| -------------- iteration %d, restart due to H(%d,%d) < 0 |\n", iter, j+1, j ); break; } } #else - if ( !arnoldi_step_PRECISION( p->V, p->Z, p->w, p->H, p->y, j, p->preconditioner, p->shift, p, l, threading ) ) { + if ( !arnoldi_step_PRECISION( p->V, p->Z, p->w, p->H, p->y, j, p->preconditioner, p, l, threading ) ) { printf0("| -------------- iteration %d, restart due to H(%d,%d) < 0 |\n", iter, j+1, j ); break; } @@ -330,7 +334,7 @@ int fgmres_PRECISION( gmres_PRECISION_struct *p, level_struct *l, struct Thread if( gamma_jp1/norm_r0 < p->tol || 
gamma_jp1/norm_r0 > 1E+5 ) { // if satisfied ... stop finish = 1; START_MASTER(threading) - if ( gamma_jp1/norm_r0 > 1E+5 ) printf0("Divergence of fgmres_PRECISION, iter = %d, level=%d\n", iter, l->level ); + if ( gamma_jp1/norm_r0 > 1E+5 ) printf0("Divergence of fgmres_PRECISION, iter = %d, level=%d\n", iter, l->level ); END_MASTER(threading) } } else { @@ -423,7 +427,7 @@ void bicgstab_PRECISION( gmres_PRECISION_struct *ps, level_struct *l, struct Thr vector_PRECISION x, b, r, r_tilde, p, pp, v, s, t; // Krylov subspace size: 5 complex_PRECISION alpha=1, beta=1, rho=1, rho_old=1, omega=1; int iter=0, maxiter; - double tol, b_norm, r_norm, s_norm; + PRECISION tol, b_norm, r_norm, s_norm; // start and end indices for vector functions depending on thread int start; int end; @@ -437,11 +441,12 @@ void bicgstab_PRECISION( gmres_PRECISION_struct *ps, level_struct *l, struct Thr vector_PRECISION_copy( r, b, start, end, l ); vector_PRECISION_copy( r_tilde, b, start, end, l ); - vector_PRECISION_define( x, 0, start, end, l ); - vector_PRECISION_define( v, 0, start, end, l ); - vector_PRECISION_define( s, 0, start, end, l ); - vector_PRECISION_define( t, 0, start, end, l ); + vector_PRECISION_define_zero( x, ps->v_start, ps->v_end, l, threading ); + vector_PRECISION_define_zero( v, ps->v_start, ps->v_end, l, threading ); + vector_PRECISION_define_zero( s, ps->v_start, ps->v_end, l, threading ); + vector_PRECISION_define_zero( t, ps->v_start, ps->v_end, l, threading ); b_norm = global_norm_PRECISION( b, ps->v_start, ps->v_end, l, threading ); + r_norm = b_norm; #if defined(TRACK_RES) && !defined(WILSON_BENCHMARK) START_MASTER(threading) @@ -453,6 +458,7 @@ void bicgstab_PRECISION( gmres_PRECISION_struct *ps, level_struct *l, struct Thr rho_old = rho; rho = global_inner_product_PRECISION( r_tilde, r, ps->v_start, ps->v_end, l, threading ); + if ( rho == 0 ) { START_MASTER(threading) printf0("rho = 0: BiCGstab did not converge.\n"); @@ -464,27 +470,29 @@ void bicgstab_PRECISION( gmres_PRECISION_struct *ps, level_struct *l, struct Thr vector_PRECISION_copy( p, r, start, end, l ); } else { beta = (rho/rho_old)*(alpha/omega); - vector_PRECISION_saxpy( pp, p, v, -omega, start, end, l ); - vector_PRECISION_saxpy( p, r, pp, beta, start, end, l ); + vector_PRECISION_saxpy( pp, p, v, -omega, ps->v_start, ps->v_end, l, threading ); + vector_PRECISION_saxpy( p, r, pp, beta, ps->v_start, ps->v_end, l, threading ); } apply_operator_PRECISION( v, p, ps, l, threading ); alpha = rho / global_inner_product_PRECISION( r_tilde, v, ps->v_start, ps->v_end, l, threading ); - vector_PRECISION_saxpy( s, r, v, -alpha, start, end, l ); + vector_PRECISION_saxpy( s, r, v, -alpha, ps->v_start, ps->v_end, l, threading ); s_norm = global_norm_PRECISION( s, ps->v_start, ps->v_end, l, threading ); - + if ( s_norm/b_norm < tol ) { - vector_PRECISION_saxpy( x, x, p, alpha, start, end, l ); + vector_PRECISION_saxpy( x, x, p, alpha, ps->v_start, ps->v_end, l, threading ); break; } apply_operator_PRECISION( t, s, ps, l, threading ); omega = global_inner_product_PRECISION( t, s, ps->v_start, ps->v_end, l, threading ) / global_inner_product_PRECISION( t, t, ps->v_start, ps->v_end, l, threading ); - vector_PRECISION_saxpy( x, x, p, alpha, start, end, l ); - vector_PRECISION_saxpy( x, x, s, omega, start, end, l ); - vector_PRECISION_saxpy( r, s, t, -omega, start, end, l ); + vector_PRECISION_saxpy( x, x, p, alpha, ps->v_start, ps->v_end, l, threading ); + vector_PRECISION_saxpy( x, x, s, omega, ps->v_start, ps->v_end, l, threading ); + 
vector_PRECISION_saxpy( r, s, t, -omega, ps->v_start, ps->v_end, l, threading ); + r_norm = global_norm_PRECISION( r, ps->v_start, ps->v_end, l, threading ); + #if defined(TRACK_RES) && !defined(WILSON_BENCHMARK) START_MASTER(threading) if ( iter % 100 == 0 ) printf0("| biCGstab relres: %12.6le, iterations: %-8d |\n", r_norm/b_norm, iter ); @@ -510,7 +518,7 @@ void cgn_PRECISION( gmres_PRECISION_struct *ps, level_struct *l, struct Thread * vector_PRECISION r_old, r_new, r_true, p, pp, Dp, x, b; complex_PRECISION alpha, beta=0, gamma; int maxiter, iter=0; - double tol, r0_norm, r_norm, prod_rr_old, t0=0, t1=0; + PRECISION tol, r0_norm, r_norm, prod_rr_old, t0=0, t1=0; // start and end indices for vector functions depending on thread int start; int end; @@ -529,14 +537,16 @@ void cgn_PRECISION( gmres_PRECISION_struct *ps, level_struct *l, struct Thread * // this puts zero for all other hyperthreads, so we can call functions below with all hyperthreads compute_core_start_end(ps->v_start, ps->v_end, &start, &end, l, threading); - vector_PRECISION_define( x, 0, start, end, l ); + vector_PRECISION_define_zero( x, ps->v_start, ps->v_end, l, threading ); apply_operator_PRECISION( Dp, x, ps, l, threading ); vector_PRECISION_minus( pp, b, Dp, start, end, l ); apply_operator_dagger_PRECISION( r_old, pp, ps, l, threading ); vector_PRECISION_copy( p, r_old, start, end, l ); - r0_norm = creal(global_norm_PRECISION( r_old, ps->v_start, ps->v_end, l, threading )); - prod_rr_old = global_inner_product_PRECISION( r_old, r_old, ps->v_start, ps->v_end, l, threading ); + r0_norm = global_norm_PRECISION( r_old, ps->v_start, ps->v_end, l, threading ); + // prod_rr_old = global_inner_product_PRECISION( r_old, r_old, ps->v_start, ps->v_end, l, threading ); + prod_rr_old = r0_norm*r0_norm; + #if defined(TRACK_RES) && !defined(WILSON_BENCHMARK) if ( ps->print ) { START_MASTER(threading) @@ -552,13 +562,13 @@ void cgn_PRECISION( gmres_PRECISION_struct *ps, level_struct *l, struct Thread * gamma = global_inner_product_PRECISION( p, Dp, ps->v_start, ps->v_end, l, threading ); alpha = prod_rr_old / gamma; - vector_PRECISION_saxpy( x, x, p, alpha, start, end, l ); - vector_PRECISION_saxpy( r_new, r_old, Dp, -alpha, start, end, l ); + vector_PRECISION_saxpy( x, x, p, alpha, ps->v_start, ps->v_end, l, threading ); + vector_PRECISION_saxpy( r_new, r_old, Dp, -alpha, ps->v_start, ps->v_end, l, threading ); gamma = global_inner_product_PRECISION( r_new, r_new, ps->v_start, ps->v_end, l, threading ); beta = gamma / prod_rr_old; - vector_PRECISION_saxpy( p, r_new, p, beta, start, end, l ); + vector_PRECISION_saxpy( p, r_new, p, beta, ps->v_start, ps->v_end, l, threading ); vector_PRECISION_copy( r_old, r_new, start, end, l ); prod_rr_old = gamma; #if defined(TRACK_RES) && !defined(WILSON_BENCHMARK) @@ -570,10 +580,11 @@ void cgn_PRECISION( gmres_PRECISION_struct *ps, level_struct *l, struct Thread * #endif } - r0_norm = creal(global_norm_PRECISION( b, ps->v_start, ps->v_end, l, threading )); + r0_norm = global_norm_PRECISION( b, ps->v_start, ps->v_end, l, threading ); apply_operator_PRECISION( Dp, x, ps, l, threading ); vector_PRECISION_minus( r_true, b, Dp, start, end, l ); - r_norm = creal(global_norm_PRECISION( r_true, ps->v_start, ps->v_end, l, threading )); + r_norm = global_norm_PRECISION( r_true, ps->v_start, ps->v_end, l, threading ); + #if defined(TRACK_RES) && !defined(WILSON_BENCHMARK) if ( ps->print ) { START_MASTER(threading) @@ -592,16 +603,16 @@ void cgn_PRECISION( gmres_PRECISION_struct *ps, level_struct *l, 
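bicgstab_PRECISION above follows the standard BiCGstab recursion (with zero initial guess, so r = b at the start). A compact single-process sketch without threading, profiling or the library's vector interface; apply_A, the scratch-vector layout and all other names are illustrative:

#include <complex.h>
#include <math.h>

static double complex dot( const double complex *x, const double complex *y, int n ) {
  double complex s = 0;
  for ( int i=0; i<n; i++ ) s += conj(x[i]) * y[i];
  return s;
}
static double nrm( const double complex *x, int n ) { return sqrt( creal( dot( x, x, n ) ) ); }

/* BiCGstab for A*x = b with x0 = 0; work provides 6 scratch
 * vectors of length n.  Returns the iteration count, -1 on
 * breakdown (rho == 0).                                        */
static int bicgstab_plain( void (*apply_A)( double complex*, const double complex*, int ),
                           double complex *x, const double complex *b,
                           double complex **work, int n, double tol, int maxiter ) {
  double complex *r=work[0], *rt=work[1], *p=work[2], *v=work[3], *s=work[4], *t=work[5];
  double complex rho=1, rho_old=1, alpha=1, omega=1, beta;
  for ( int i=0; i<n; i++ ) { x[i]=0; v[i]=0; p[i]=0; r[i]=b[i]; rt[i]=b[i]; }
  double b_norm = nrm( b, n ), r_norm = b_norm;
  int iter = 0;
  while ( r_norm/b_norm > tol && iter < maxiter ) {
    iter++;
    rho_old = rho;
    rho = dot( rt, r, n );
    if ( rho == 0 ) return -1;                      /* breakdown */
    if ( iter == 1 ) {
      for ( int i=0; i<n; i++ ) p[i] = r[i];
    } else {
      beta = (rho/rho_old)*(alpha/omega);
      for ( int i=0; i<n; i++ ) p[i] = r[i] + beta*( p[i] - omega*v[i] );
    }
    apply_A( v, p, n );
    alpha = rho / dot( rt, v, n );
    for ( int i=0; i<n; i++ ) s[i] = r[i] - alpha*v[i];
    if ( nrm( s, n )/b_norm < tol ) {               /* early exit */
      for ( int i=0; i<n; i++ ) x[i] += alpha*p[i];
      break;
    }
    apply_A( t, s, n );
    omega = dot( t, s, n ) / dot( t, t, n );
    for ( int i=0; i<n; i++ ) {
      x[i] += alpha*p[i] + omega*s[i];
      r[i]  = s[i] - omega*t[i];
    }
    r_norm = nrm( r, n );
  }
  return iter;
}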
struct Thread * gamma = global_inner_product_PRECISION( p, Dp, ps->v_start, ps->v_end, l, threading ); alpha = prod_rr_old / gamma; - vector_PRECISION_saxpy( x, x, p, alpha, start, end, l ); - vector_PRECISION_saxpy( r_new, r_old, Dp, -alpha, start, end, l ); + vector_PRECISION_saxpy( x, x, p, alpha, ps->v_start, ps->v_end, l, threading ); + vector_PRECISION_saxpy( r_new, r_old, Dp, -alpha, ps->v_start, ps->v_end, l, threading ); // residual update - vector_PRECISION_saxpy( r_true, r_true, pp, -alpha, start, end, l ); - r_norm = creal(global_norm_PRECISION( r_true, ps->v_start, ps->v_end, l, threading )); + vector_PRECISION_saxpy( r_true, r_true, pp, -alpha, ps->v_start, ps->v_end, l, threading ); + r_norm = global_norm_PRECISION( r_true, ps->v_start, ps->v_end, l, threading ); gamma = global_inner_product_PRECISION( r_new, r_new, ps->v_start, ps->v_end, l, threading ); beta = gamma / prod_rr_old; - vector_PRECISION_saxpy( p, r_new, p, beta, start, end, l ); + vector_PRECISION_saxpy( p, r_new, p, beta, ps->v_start, ps->v_end, l, threading ); vector_PRECISION_copy( r_old, r_new, start, end, l ); prod_rr_old = gamma; #if defined(TRACK_RES) && !defined(WILSON_BENCHMARK) @@ -621,7 +632,8 @@ void cgn_PRECISION( gmres_PRECISION_struct *ps, level_struct *l, struct Thread * END_MASTER(threading) apply_operator_PRECISION( Dp, x, ps, l, threading ); vector_PRECISION_minus( pp, b, Dp, start, end, l ); - beta = creal(global_norm_PRECISION( pp, ps->v_start, ps->v_end, l, threading )); + + beta = global_norm_PRECISION( pp, ps->v_start, ps->v_end, l, threading ); START_MASTER(threading) if ( ps->timing ) printf0("| exact relative residual: ||r||/||b|| = %e |\n", creal(beta/r0_norm) ); printf0("| elapsed wall clock time: %-12g seconds |\n", t1-t0 ); @@ -648,7 +660,7 @@ void cgn_PRECISION( gmres_PRECISION_struct *ps, level_struct *l, struct Thread * int arnoldi_step_PRECISION( vector_PRECISION *V, vector_PRECISION *Z, vector_PRECISION w, complex_PRECISION **H, complex_PRECISION* buffer, int j, void (*prec)(), - complex_PRECISION shift, gmres_PRECISION_struct *p, level_struct *l, struct Thread *threading ) { + gmres_PRECISION_struct *p, level_struct *l, struct Thread *threading ) { /********************************************************************************* * Extends the Arnoldi basis by one vector. @@ -663,7 +675,6 @@ int arnoldi_step_PRECISION( vector_PRECISION *V, vector_PRECISION *Z, vector_PRE * against all previous ones. * - void (*prec)(): Function pointer to preconditioner (can be NULL if no * preconditioning is used). -* - complex_PRECISION shift: Denotes the dirac shift (can be 0). 
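cgn_PRECISION solves the normal equations D^dag D x = D^dag b with conjugate gradients (CGNR), which makes it usable as a reference solver for the non-Hermitian operator. A single-process sketch of one common CGNR formulation that keeps the true residual r = b - D x; the library routine additionally tracks the normal-equation residual and runs a second, residual-replacement loop. apply_D/apply_Ddag and the scratch layout are illustrative:

#include <complex.h>
#include <math.h>

/* CG on D^dag D x = D^dag b with x0 = 0; work holds 4 scratch
 * vectors of length n.                                          */
static int cgnr_plain( void (*apply_D)( double complex*, const double complex*, int ),
                       void (*apply_Ddag)( double complex*, const double complex*, int ),
                       double complex *x, const double complex *b,
                       double complex **work, int n, double tol, int maxiter ) {
  double complex *r=work[0], *z=work[1], *p=work[2], *w=work[3];
  for ( int i=0; i<n; i++ ) { x[i] = 0; r[i] = b[i]; }     /* r = b - D*0 */
  apply_Ddag( z, r, n );                                   /* z = D^dag r */
  for ( int i=0; i<n; i++ ) p[i] = z[i];
  double zz=0, b_norm=0, r_norm;
  for ( int i=0; i<n; i++ ) { zz += creal( conj(z[i])*z[i] ); b_norm += creal( conj(b[i])*b[i] ); }
  b_norm = sqrt( b_norm ); r_norm = b_norm;
  int iter = 0;
  while ( r_norm/b_norm > tol && iter < maxiter ) {
    iter++;
    apply_D( w, p, n );                                    /* w = D p */
    double ww = 0;
    for ( int i=0; i<n; i++ ) ww += creal( conj(w[i])*w[i] );
    double alpha = zz/ww;                                  /* <z,z>/<Dp,Dp> */
    for ( int i=0; i<n; i++ ) { x[i] += alpha*p[i]; r[i] -= alpha*w[i]; }
    apply_Ddag( z, r, n );                                 /* z = D^dag r */
    double zz_new=0, rn=0;
    for ( int i=0; i<n; i++ ) { zz_new += creal( conj(z[i])*z[i] ); rn += creal( conj(r[i])*r[i] ); }
    double beta = zz_new/zz;
    for ( int i=0; i<n; i++ ) p[i] = z[i] + beta*p[i];
    zz = zz_new; r_norm = sqrt( rn );
  }
  return iter;
}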
*********************************************************************************/ #ifdef SINGLE_ALLREDUCE_ARNOLDI #ifdef PIPELINED_ARNOLDI @@ -675,19 +686,18 @@ int arnoldi_step_PRECISION( vector_PRECISION *V, vector_PRECISION *Z, vector_PRE int start, end, i; const complex_PRECISION sigma = 0; compute_core_start_end(p->v_start, p->v_end, &start, &end, l, threading); - + if ( j == 0 ) vector_PRECISION_copy( Z[0], V[0], start, end, l ); else vector_PRECISION_copy( V[j], Z[j], start, end, l ); - + complex_PRECISION tmp[j+1]; process_multi_inner_product_PRECISION( j+1, tmp, V, V[j], p->v_start, p->v_end, l, threading ); START_MASTER(threading) PROF_PRECISION_START( _ALLR ); - for( i=0; i<=j; i++ ) { + for( i=0; i<=j; i++ ) buffer[i] = tmp[i]; - } if ( g.num_processes > 1 ) { MPI_Iallreduce( buffer, H[MAX(0,j-1)], j+1, MPI_COMPLEX_PRECISION, MPI_SUM, (l->depth==0)?g.comm_cart:l->gs_PRECISION.level_comm, &req ); @@ -715,8 +725,7 @@ int arnoldi_step_PRECISION( vector_PRECISION *V, vector_PRECISION *Z, vector_PRE SYNC_MASTER_TO_ALL(threading) for( i=0; iv_start, p->v_end, l, threading ); vector_PRECISION_real_scale( V[j], V[j], 1/H[MAX(0,j-1)][j], start, end, l ); START_MASTER(threading) @@ -727,14 +736,14 @@ int arnoldi_step_PRECISION( vector_PRECISION *V, vector_PRECISION *Z, vector_PRE SYNC_MASTER_TO_ALL(threading) if ( j == 0 ) { - if ( sigma ) vector_PRECISION_saxpy( Z[j+1], Z[j+1], Z[j], -sigma, start, end, l ); + if ( sigma ) vector_PRECISION_saxpy( Z[j+1], Z[j+1], Z[j], -sigma, p->v_start, p->v_end, l, threading ); } else { for( i=0; iv_start, p->v_end, l, threading ); } vector_PRECISION_real_scale( Z[j+1], Z[j+1], 1/H[MAX(0,j-1)][j], start, end, l ); - + } else { #endif SYNC_MASTER_TO_ALL(threading) @@ -742,13 +751,12 @@ int arnoldi_step_PRECISION( vector_PRECISION *V, vector_PRECISION *Z, vector_PRE int start, end, i; const complex_PRECISION sigma = 0; compute_core_start_end(p->v_start, p->v_end, &start, &end, l, threading); - + if ( prec != NULL ) { if ( p->kind == _LEFT ) { apply_operator_PRECISION( Z[0], V[j], p, l, threading ); - if ( shift ) vector_PRECISION_saxpy( Z[0], Z[0], V[j], shift, start, end, l ); prec( V[j+1], NULL, Z[0], _NO_RES, l, threading ); - if ( sigma ) vector_PRECISION_saxpy( V[j+1], V[j+1], V[j], -sigma, start, end, l ); + if ( sigma ) vector_PRECISION_saxpy( V[j+1], V[j+1], V[j], -sigma, p->v_start, p->v_end, l, threading ); } else { if ( l->level == 0 ) { prec( Z[j], NULL, V[j], _NO_RES, l, threading ); @@ -762,20 +770,19 @@ int arnoldi_step_PRECISION( vector_PRECISION *V, vector_PRECISION *Z, vector_PRE apply_operator_PRECISION( V[j+1], Z[j], p, l, threading ); // w = D*Z[j] } } - if ( shift ) vector_PRECISION_saxpy( V[j+1], V[j+1], Z[j], shift, start, end, l ); - if ( sigma ) vector_PRECISION_saxpy( V[j+1], V[j+1], V[j], -sigma, start, end, l ); + if ( sigma ) vector_PRECISION_saxpy( V[j+1], V[j+1], V[j], -sigma, p->v_start, p->v_end, l, threading ); + } } else { apply_operator_PRECISION( V[j+1], V[j], p, l, threading ); // w = D*V[j] - if ( shift-sigma ) vector_PRECISION_saxpy( V[j+1], V[j+1], V[j], shift-sigma, start, end, l ); + if ( sigma ) vector_PRECISION_saxpy( V[j+1], V[j+1], V[j], -sigma, p->v_start, p->v_end, l, threading ); } complex_PRECISION tmp[j+2]; process_multi_inner_product_PRECISION( j+2, tmp, V, V[j+1], p->v_start, p->v_end, l, threading ); START_MASTER(threading) - for( i=0; i<=j+1; i++ ) { + for( i=0; i<=j+1; i++ ) buffer[i] = tmp[i]; - } if ( g.num_processes > 1 ) { PROF_PRECISION_START( _ALLR ); @@ -797,8 +804,7 @@ int 
arnoldi_step_PRECISION( vector_PRECISION *V, vector_PRECISION *Z, vector_PRE SYNC_MASTER_TO_ALL(threading) for( i=0; i<=j; i++ ) - vector_PRECISION_saxpy( V[j+1], V[j+1], V[i], -H[j][i], start, end, l ); - + vector_PRECISION_saxpy( V[j+1], V[j+1], V[i], -H[j][i], p->v_start, p->v_end, l, threading ); vector_PRECISION_real_scale( V[j+1], V[j+1], 1/H[j][j+1], start, end, l ); START_LOCKED_MASTER(threading) H[j][j] += sigma; @@ -815,11 +821,10 @@ int arnoldi_step_PRECISION( vector_PRECISION *V, vector_PRECISION *Z, vector_PRE // compute start and end indices for core // this puts zero for all other hyperthreads, so we can call functions below with all hyperthreads compute_core_start_end(p->v_start, p->v_end, &start, &end, l, threading); - + if ( prec != NULL ) { if ( p->kind == _LEFT ) { apply_operator_PRECISION( Z[0], V[j], p, l, threading ); - if ( shift ) vector_PRECISION_saxpy( Z[0], Z[0], V[j], shift, start, end, l ); prec( w, NULL, Z[0], _NO_RES, l, threading ); } else { if ( l->level == 0 ) { @@ -832,12 +837,10 @@ int arnoldi_step_PRECISION( vector_PRECISION *V, vector_PRECISION *Z, vector_PRE prec( Z[j], NULL, V[j], _NO_RES, l, threading ); apply_operator_PRECISION( w, Z[j], p, l, threading ); // w = D*Z[j] } - if ( shift ) vector_PRECISION_saxpy( w, w, Z[j], shift, start, end, l ); } } } else { apply_operator_PRECISION( w, V[j], p, l, threading ); // w = D*V[j] - if ( shift ) vector_PRECISION_saxpy( w, w, V[j], shift, start, end, l ); } // orthogonalization @@ -857,8 +860,7 @@ int arnoldi_step_PRECISION( vector_PRECISION *V, vector_PRECISION *Z, vector_PRE END_MASTER(threading) SYNC_MASTER_TO_ALL(threading) for( i=0; i<=j; i++ ) - vector_PRECISION_saxpy( w, w, V[i], -H[j][i], start, end, l ); - + vector_PRECISION_saxpy( w, w, V[i], -H[j][i], p->v_start, p->v_end, l, threading ); #ifdef REORTH // re-orthogonalization process_multi_inner_product_PRECISION( j+1, tmp, V, w, p->v_start, p->v_end, l, threading ); @@ -873,15 +875,15 @@ int arnoldi_step_PRECISION( vector_PRECISION *V, vector_PRECISION *Z, vector_PRE for( i=0; i<=j; i++ ) H[j][i] += tmp[i]; - + END_MASTER(threading) SYNC_MASTER_TO_ALL(threading) for( i=0; i<=j; i++ ) - vector_PRECISION_saxpy( w, w, V[i], -tmp[i], start, end, l ); + vector_PRECISION_saxpy( w, w, V[i], -tmp[i], p->v_start, p->v_end, l, threading ); #endif // normalization - complex_PRECISION tmp2 = global_norm_PRECISION( w, p->v_start, p->v_end, l, threading ); + PRECISION tmp2 = global_norm_PRECISION( w, p->v_start, p->v_end, l, threading ); START_MASTER(threading) H[j][j+1] = tmp2; END_MASTER(threading) @@ -972,12 +974,14 @@ void compute_solution_PRECISION( vector_PRECISION x, vector_PRECISION *V, comple // x = x + V*y if ( ol ) { - for ( i=0; i<=j; i++ ) - vector_PRECISION_saxpy( x, x, V[i], y[i], start, end, l ); + for ( i=0; i<=j; i++ ) { + vector_PRECISION_saxpy( x, x, V[i], y[i], p->v_start, p->v_end, l, threading ); + } } else { - vector_PRECISION_scale( x, V[0], y[0], start, end, l ); - for ( i=1; i<=j; i++ ) - vector_PRECISION_saxpy( x, x, V[i], y[i], start, end, l ); + vector_PRECISION_scale( x, V[0], y[0], p->v_start, p->v_end, l, threading ); + for ( i=1; i<=j; i++ ) { + vector_PRECISION_saxpy( x, x, V[i], y[i], p->v_start, p->v_end, l, threading ); + } } } @@ -998,17 +1002,17 @@ void local_minres_PRECISION( vector_PRECISION phi, vector_PRECISION eta, vector_ START_UNTHREADED_FUNCTION(threading) - int i, end = (g.odd_even&&l->depth==0)?start+12*s->num_block_even_sites:start+s->block_vector_size, - n = l->block_iter; + int i, nv = 
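The heart of arnoldi_step_PRECISION is one Arnoldi extension with classical Gram-Schmidt: apply the (possibly preconditioned) operator to the last basis vector, orthogonalize against all previous ones, and store the coefficients in column j of the Hessenberg matrix, with H[j][i] = <V[i], w> and H[j][j+1] = ||w|| as in the library's storage convention. A single-process sketch without preconditioning, re-orthogonalization (REORTH) or the single-Allreduce variants; names are illustrative:

#include <complex.h>
#include <math.h>

/* Extend V[0..j] by V[j+1]; returns 0 on breakdown (||w|| = 0). */
static int arnoldi_step_plain( void (*apply_A)( double complex*, const double complex*, int ),
                               double complex **V, double complex **H,
                               double complex *w, int j, int n ) {
  apply_A( w, V[j], n );                          /* w = A v_j */
  for ( int i=0; i<=j; i++ ) {                    /* H[j][i] = <v_i, w> */
    double complex h = 0;
    for ( int k=0; k<n; k++ ) h += conj(V[i][k]) * w[k];
    H[j][i] = h;
  }
  for ( int i=0; i<=j; i++ )                      /* w -= sum_i H[j][i]*v_i */
    for ( int k=0; k<n; k++ ) w[k] -= H[j][i] * V[i][k];
  double norm = 0;
  for ( int k=0; k<n; k++ ) norm += creal( conj(w[k]) * w[k] );
  norm = sqrt( norm );
  H[j][j+1] = norm;
  if ( norm < 1E-15 ) return 0;                   /* breakdown */
  for ( int k=0; k<n; k++ ) V[j+1][k] = w[k] / norm;
  return 1;
}

In the library the j+1 inner products are gathered by process_multi_inner_product_PRECISION and combined in a single MPI_Allreduce, which is why the orthogonalization is classical rather than modified Gram-Schmidt.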
l->num_lattice_site_var, n = l->block_iter, + end = (g.odd_even&&l->depth==0)?(start+nv*s->num_block_even_sites):(start+s->block_vector_size); vector_PRECISION Dr = s->local_minres_buffer[0]; vector_PRECISION r = s->local_minres_buffer[1]; vector_PRECISION lphi = s->local_minres_buffer[2]; complex_PRECISION alpha; void (*block_op)() = (l->depth==0)?(g.odd_even?apply_block_schur_complement_PRECISION:block_d_plus_clover_PRECISION) :coarse_block_operator_PRECISION; - + vector_PRECISION_copy( r, eta, start, end, l ); - vector_PRECISION_define( lphi, 0, start, end, l ); + vector_PRECISION_define_zero( lphi, start, end, l, no_threading ); for ( i=0; i/ alpha = local_xy_over_xx_PRECISION( Dr, r, start, end, l ); // phi += alpha * r - vector_PRECISION_saxpy( lphi, lphi, r, alpha, start, end, l ); + vector_PRECISION_saxpy( lphi, lphi, r, alpha, start, end, l, no_threading ); // r -= alpha * Dr - vector_PRECISION_saxpy( r, r, Dr, -alpha, start, end, l ); + vector_PRECISION_saxpy( r, r, Dr, -alpha, start, end, l, no_threading ); } if ( latest_iter != NULL ) vector_PRECISION_copy( latest_iter, lphi, start, end, l ); @@ -1038,7 +1042,7 @@ void fgcr_PRECISION( gmres_PRECISION_struct *p, level_struct *l ) { int i, j=-1, finish=0, iter=0, il, ol; complex_PRECISION beta = 0, alpha; - double norm_r0=0, t0=0, t1=0; + PRECISION r0_norm=0, t0=0, t1=0; if ( p->timing || p->print ) t0 = MPI_Wtime(); #if defined(TRACK_RES) && !defined(WILSON_BENCHMARK) @@ -1048,13 +1052,14 @@ void fgcr_PRECISION( gmres_PRECISION_struct *p, level_struct *l ) { if( ol == 0 && p->initial_guess_zero ) { vector_PRECISION_copy( p->r, p->b, p->v_start, p->v_end, l ); + } else { apply_operator_PRECISION( p->w, p->x, p, l, no_threading ); // compute w = D*x vector_PRECISION_minus( p->r, p->b, p->w, p->v_start, p->v_end, l ); // compute r = b - w } if( ol == 0) { - norm_r0 = (complex_PRECISION) global_norm_PRECISION( p->r, p->v_start, p->v_end, l, no_threading ); + r0_norm = global_norm_PRECISION( p->r, p->v_start, p->v_end, l, no_threading ); } for( il=0; ilrestart_length && finish==0; il++ ) { @@ -1066,16 +1071,16 @@ void fgcr_PRECISION( gmres_PRECISION_struct *p, level_struct *l ) { for( i=0; iZ[i], p->Z[j], p->v_start, p->v_end, l, no_threading ) / p->gamma[i]; - vector_PRECISION_saxpy( p->V[j], p->V[j], p->V[i], -beta, p->v_start, p->v_end, l ); - vector_PRECISION_saxpy( p->Z[j], p->Z[j], p->Z[i], -beta, p->v_start, p->v_end, l ); + vector_PRECISION_saxpy( p->V[j], p->V[j], p->V[i], -beta, p->v_start, p->v_end, l, no_threading ); + vector_PRECISION_saxpy( p->Z[j], p->Z[j], p->Z[i], -beta, p->v_start, p->v_end, l, no_threading ); } p->gamma[j] = global_inner_product_PRECISION( p->Z[j], p->Z[j], p->v_start, p->v_end, l, no_threading ); alpha = global_inner_product_PRECISION( p->Z[j], p->r, p->v_start, p->v_end, l, no_threading ) / p->gamma[j]; - vector_PRECISION_saxpy( p->x, p->x, p->V[j], alpha, p->v_start, p->v_end, l ); - vector_PRECISION_saxpy( p->r, p->r, p->Z[j], -alpha, p->v_start, p->v_end, l ); + vector_PRECISION_saxpy( p->x, p->x, p->V[j], alpha, p->v_start, p->v_end, l, no_threading ); + vector_PRECISION_saxpy( p->r, p->r, p->Z[j], -alpha, p->v_start, p->v_end, l, no_threading ); - alpha = global_norm_PRECISION( p->r, p->v_start, p->v_end, l, no_threading ) / norm_r0; + alpha = global_norm_PRECISION( p->r, p->v_start, p->v_end, l, no_threading ) / r0_norm; if ( creal(alpha) < p->tol ) { finish = 1; break; @@ -1098,7 +1103,7 @@ void fgcr_PRECISION( gmres_PRECISION_struct *p, level_struct *l ) { #endif 
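local_minres_PRECISION is a one-dimensional minimal-residual relaxation on a Schwarz block: each sweep picks alpha = <Dr,r>/<Dr,Dr>, which minimizes ||r - alpha*Dr|| over alpha, then updates phi += alpha*r and r -= alpha*Dr. A plain sketch of those n = block_iter sweeps, assuming phi and r have been initialized by the caller (illustrative names, no blocking or odd-even logic):

#include <complex.h>

static void local_minres_plain( void (*apply_D)( double complex*, const double complex*, int ),
                                double complex *phi, double complex *r,
                                double complex *Dr, int n, int iters ) {
  for ( int it=0; it<iters; it++ ) {
    apply_D( Dr, r, n );
    double complex num = 0; double den = 0;
    for ( int k=0; k<n; k++ ) {
      num += conj(Dr[k]) * r[k];                 /* <Dr, r>  */
      den += creal( conj(Dr[k]) * Dr[k] );       /* <Dr, Dr> */
    }
    double complex alpha = num/den;
    for ( int k=0; k<n; k++ ) {
      phi[k] += alpha * r[k];                    /* phi += alpha*r */
      r[k]   -= alpha * Dr[k];                   /* r   -= alpha*Dr */
    }
  }
}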
printf0("+----------------------------------------------------------+\n"); printf0("| FGCR iterations: %-6d |\n", iter ); - printf0("| exact relative residual: ||r||/||b|| = %e |\n", creal(beta)/norm_r0 ); + printf0("| exact relative residual: ||r||/||b|| = %e |\n", creal(beta)/r0_norm ); printf0("| elapsed wall clock time: %-7lf seconds |\n", t1-t0 ); if ( g.coarse_time > 0 ) printf0("| coarse grid time: %-7lf seconds (%04.1lf%%) |\n", diff --git a/src/linsolve_generic.h b/src/linsolve_generic.h index 1a7f9cd..e28bb6b 100644 --- a/src/linsolve_generic.h +++ b/src/linsolve_generic.h @@ -36,8 +36,8 @@ void local_minres_PRECISION( vector_PRECISION phi, vector_PRECISION eta, vector_PRECISION latest_iter, int start, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ); int arnoldi_step_PRECISION( vector_PRECISION *V, vector_PRECISION *Z, vector_PRECISION w, - complex_PRECISION **H, complex_PRECISION* buffer, int j, void (*prec)(), - complex_PRECISION shift, gmres_PRECISION_struct *p, level_struct *l, struct Thread *threading ); + complex_PRECISION **H, complex_PRECISION* buffer, int j, void (*prec)(), + gmres_PRECISION_struct *p, level_struct *l, struct Thread *threading ); void qr_update_PRECISION( complex_PRECISION **H, complex_PRECISION *s, complex_PRECISION *c, complex_PRECISION *gamma, int j, level_struct *l, struct Thread *threading ); diff --git a/src/main.c b/src/main.c index 6723fa4..af5cf8c 100644 --- a/src/main.c +++ b/src/main.c @@ -75,7 +75,7 @@ int main( int argc, char **argv ) { commonthreaddata = (struct common_thread_data *)malloc(sizeof(struct common_thread_data)); init_common_thread_data(commonthreaddata); -#pragma omp parallel num_threads(g.num_openmp_processes) + THREADED(g.num_openmp_processes) { struct Thread threading; setup_threading(&threading, commonthreaddata, &l); diff --git a/src/main.h b/src/main.h index 4bdf103..0c9dc60 100644 --- a/src/main.h +++ b/src/main.h @@ -38,19 +38,21 @@ #define EPS_float 1E-6 #define EPS_double 1E-14 - #define HAVE_TM // flag for enable twisted mass + #define HAVE_TM // flag for enable twisted mass + #define HAVE_TM1p1 // flag for enable doublet for twisted mass + #undef INIT_ONE_PREC // flag undef for enabling additional features in the lib - #define FOR2( e ) { e e } - #define FOR3( e ) { e e e } - #define FOR4( e ) { e e e e } - #define FOR10( e ) { e e e e e e e e e e } - #define FOR20( e ) { e e e e e e e e e e e e e e e e e e e e } - #define FOR40( e ) { e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e } + #define FOR2( e ) { e e } + #define FOR3( e ) { e e e } + #define FOR4( e ) { e e e e } #define FOR6( e ) { e e e e e e } + #define FOR10( e ) { e e e e e e e e e e } #define FOR12( e ) { e e e e e e e e e e e e } - #define FOR24( e ) { e e e e e e e e e e e e e e e e e e e e e e e e } + #define FOR20( e ) { FOR10( e ) FOR10( e ) } + #define FOR24( e ) { FOR12( e ) FOR12( e ) } #define FOR36( e ) { FOR12( e ) FOR12( e ) FOR12( e ) } + #define FOR40( e ) { FOR20( e ) FOR20( e ) } #define FOR42( e ) { FOR36( e ) FOR6( e ) } #define SQUARE( e ) (e)*(e) @@ -73,12 +75,14 @@ #define cimag_float cimagf #define csqrt_double csqrt #define csqrt_float csqrtf + #define sqrt_double sqrt + #define sqrt_float sqrtf #define cpow_double cpow #define cpow_float cpowf #define pow_double pow #define pow_float powf - #define abs_float fabs - #define abs_double abs + #define abs_double fabs + #define abs_float fabsf #ifdef SSE #define MALLOC( variable, kind, length ) do{ if ( variable != NULL ) { \ 
@@ -177,6 +181,7 @@ #define DEBUGOUTPUT( A, FORMAT ) #endif + #include "simd_vectorization_control.h" #include "vectorization_control.h" #include "threading.h" @@ -185,7 +190,7 @@ enum { _NO_DEFAULT_SET, _DEFAULT_SET }; enum { _NO_REORDERING, _REORDER }; enum { _ADD, _COPY }; - enum { _ORDINARY, _SCHWARZ }; + enum { _ORDINARY, _SCHWARZ, _ODDEVEN }; enum { _RES, _NO_RES }; enum { _STANDARD, _LIME }; //formats enum { _READ, _WRITE }; @@ -200,6 +205,7 @@ _SM1, _SM2, _SM3, _SM4, _SMALL1, _SMALL2, _NUM_PROF }; // _NUM_PROF has always to be the last constant! enum { _VTS = 20 }; enum { _TRCKD_VAL, _STP_TIME, _SLV_ITER, _SLV_TIME, _CRS_ITER, _CRS_TIME, _SLV_ERR, _CGNR_ERR, _NUM_OPTB }; + enum { _SSE, _AVX }; typedef struct block_struct { int start, color, no_comm, *bt; @@ -297,6 +303,7 @@ int *local_lattice; int *block_lattice; int num_eig_vect; + int num_parent_eig_vect; int coarsening[4]; int global_splitting[4]; int periodic_bc[4]; @@ -319,13 +326,7 @@ int schwarz_vector_size; int D_size; int clover_size; - // operator - double real_shift; - complex_double dirac_shift, even_shift, odd_shift; -#ifdef HAVE_TM int block_size; - complex_double tm_shift, tm_even_shift, tm_odd_shift; -#endif // buffer vectors vector_float vbuf_float[9], sbuf_float[2]; vector_double vbuf_double[9], sbuf_double[2]; @@ -337,10 +338,8 @@ // next coarser level struct level_struct *next_level; - } level_struct; - typedef struct global_struct { FILE *logfile; @@ -369,15 +368,21 @@ // profiling, analysis, output int coarse_iter_count, iter_count, iterator, print, conf_flag, setup_flag, in_setup; double coarse_time, prec_time, *output_table[8], cur_storage, max_storage, total_time, - plaq_hopp, plaq_clov, norm_res, plaq, setup_m0, solve_m0, bicgstab_tol, twisted_bc[4], - test; + plaq_hopp, plaq_clov, norm_res, plaq, bicgstab_tol, twisted_bc[4], test; + + double m0, setup_m0; #ifdef HAVE_TM // twisted mass parameters int downprop; - double tm_mu, setup_tm_mu, tm_mu_odd_shift, tm_mu_even_shift, *tm_mu_factor; + double mu, setup_mu, mu_odd_shift, mu_even_shift, *mu_factor; #endif - + +#ifdef HAVE_TM1p1 + int n_flavours; + double epsbar, epsbar_ig5_odd_shift, epsbar_ig5_even_shift, *epsbar_factor; +#endif + // index functions for external usage int (*conf_index_fct)(), (*vector_index_fct)(); int *odd_even_table; @@ -462,29 +467,19 @@ // functions #include "clifford.h" +#ifdef SIMD +#include "simd_complex_float.h" +#include "simd_complex_double.h" +#include "simd_blas_float.h" +#include "simd_blas_double.h" +#endif #ifdef SSE #include "vectorization_dirac_float.h" #include "vectorization_dirac_double.h" -#include "blas_vectorized.h" -#include "sse_blas_vectorized.h" #include "sse_complex_float_intrinsic.h" #include "sse_complex_double_intrinsic.h" -#include "sse_coarse_operator_float.h" -#include "sse_coarse_operator_double.h" -#include "sse_linalg_float.h" -#include "sse_linalg_double.h" -#include "sse_interpolation_float.h" -#include "sse_interpolation_double.h" -#include "sse_schwarz_float.h" -#include "sse_schwarz_double.h" -#else -//no intrinsics -#include "interpolation_float.h" -#include "interpolation_double.h" #endif -#include "data_float.h" -#include "data_double.h" #include "data_layout.h" #include "io.h" #include "init.h" @@ -500,6 +495,10 @@ #include "linalg_double.h" #include "ghost_float.h" #include "ghost_double.h" +#include "gram_schmidt_float.h" +#include "gram_schmidt_double.h" +#include "interpolation_float.h" +#include "interpolation_double.h" #include "linsolve_float.h" #include "linsolve_double.h" #include 
"linsolve.h" @@ -521,6 +520,8 @@ #include "gathering_double.h" #include "coarse_operator_float.h" #include "coarse_operator_double.h" +#include "coarse_coupling_float.h" +#include "coarse_coupling_double.h" #include "coarse_oddeven_float.h" #include "coarse_oddeven_double.h" #include "var_table.h" diff --git a/src/main_post_def_generic.h b/src/main_post_def_generic.h index c138409..690ef6b 100644 --- a/src/main_post_def_generic.h +++ b/src/main_post_def_generic.h @@ -26,19 +26,53 @@ #include "dirac_PRECISION.h" #include "coarse_operator_PRECISION.h" - static inline void apply_operator_PRECISION( vector_PRECISION output, vector_PRECISION input, gmres_PRECISION_struct *p, level_struct *l, struct Thread *threading ) { + p->eval_operator( output, input, p->op, l, threading ); - if ( p->shift ) { - int start, end; - compute_core_start_end_custom(p->v_start, p->v_end, &start, &end, l, threading, l->num_lattice_site_var ); - vector_PRECISION_saxpy( output, output, input, -p->shift, start, end, l ); - } + } static inline void apply_operator_dagger_PRECISION( vector_PRECISION output, vector_PRECISION input, gmres_PRECISION_struct *p, level_struct *l, struct Thread *threading ) { - if ( l->depth > 0 ) apply_coarse_operator_dagger_PRECISION( output, input, &(l->s_PRECISION.op), l, threading ); - else d_plus_clover_dagger_PRECISION( output, input, p->op, l, threading ); + +#ifdef HAVE_TM1p1 + if( g.n_flavours == 2 ) { + tau1_gamma5_PRECISION( l->vbuf_PRECISION[6], input, l, threading ); + } else +#endif + { + gamma5_PRECISION( l->vbuf_PRECISION[6], input, l, threading ); +#ifdef HAVE_TM + //TODO: change_mu_sign_PRECISION( p->op, l, threading ); +#endif + } + + apply_operator_PRECISION( l->vbuf_PRECISION[7], l->vbuf_PRECISION[6], p, l, threading ); + +#ifdef HAVE_TM1p1 + if( g.n_flavours == 2 ) { + tau1_gamma5_PRECISION( output, l->vbuf_PRECISION[7], l, threading ); + } else +#endif + { + gamma5_PRECISION( output, l->vbuf_PRECISION[7], l, threading ); +#ifdef HAVE_TM + //TODO: change_mu_sign_PRECISION( p->op, l, threading ); +#endif + } + + } + + static inline void test0_PRECISION( char* format, int depth, PRECISION test ) { + if ( g.my_rank == 0 && g.print >= 0 ) { + if ( test > EPS_PRECISION ) + printf("\x1b[31m"); + printf(format, depth, test); + if ( test > EPS_PRECISION ) + printf("\x1b[0m"); + if ( test > g.test ) + g.test = test; + fflush(0); + } } #endif diff --git a/src/main_pre_def_generic.h b/src/main_pre_def_generic.h index 21e5576..d61b1b5 100644 --- a/src/main_pre_def_generic.h +++ b/src/main_pre_def_generic.h @@ -45,10 +45,9 @@ } gathering_PRECISION_struct; typedef struct { - config_PRECISION D, clover, oe_clover; -#ifdef HAVE_TM - config_PRECISION odd_proj, tm_term; -#endif + double m0; + config_PRECISION D, clover, clover_oo_inv; + config_PRECISION odd_proj; //identity on the odd sites int oe_offset, self_coupling, num_even_sites, num_odd_sites, *index_table, *neighbor_table, *translation_table, table_dim[4], *backward_neighbor_table, @@ -58,12 +57,22 @@ OPERATOR_TYPE_PRECISION *D_vectorized; OPERATOR_TYPE_PRECISION *D_transformed_vectorized; OPERATOR_TYPE_PRECISION *clover_vectorized; - OPERATOR_TYPE_PRECISION *oe_clover_vectorized; + OPERATOR_TYPE_PRECISION *clover_oo_inv_vectorized; +#ifdef HAVE_TM + double mu, mu_odd_shift, mu_even_shift; + config_PRECISION tm_term; +#endif +#ifdef HAVE_TM1p1 + double epsbar, epsbar_ig5_odd_shift, epsbar_ig5_even_shift; + config_PRECISION epsbar_term, clover_doublet_oo_inv; + OPERATOR_TYPE_PRECISION *clover_doublet_vectorized; + 
OPERATOR_TYPE_PRECISION *clover_doublet_oo_inv_vectorized; +#endif } operator_PRECISION_struct; typedef struct { vector_PRECISION x, b, r, w, *V, *Z; - complex_PRECISION **H, *y, *gamma, *c, *s, shift; + complex_PRECISION **H, *y, *gamma, *c, *s; config_PRECISION *D, *clover; operator_PRECISION_struct *op; PRECISION tol; @@ -75,7 +84,7 @@ typedef struct { operator_PRECISION_struct op; - vector_PRECISION buf1, buf2, buf3, buf4, buf5, bbuf1, bbuf2, bbuf3, oe_bbuf[6]; + vector_PRECISION buf1, buf2, buf3, buf4, buf5; vector_PRECISION oe_buf[4]; vector_PRECISION local_minres_buffer[3]; int block_oe_offset, *index[4], dir_length[4], num_blocks, num_colors, diff --git a/src/oddeven_generic.c b/src/oddeven_generic.c index d8da7af..801995f 100644 --- a/src/oddeven_generic.c +++ b/src/oddeven_generic.c @@ -76,6 +76,7 @@ void selfcoupling_cholesky_decomposition_PRECISION( const config_PRECISION outpu } } +#ifdef HAVE_TM void selfcoupling_LU_decomposition_PRECISION( const config_PRECISION output, config_double input ) { /********************************************************************************* @@ -89,51 +90,105 @@ void selfcoupling_LU_decomposition_PRECISION( const config_PRECISION output, con *********************************************************************************/ register int i, j, k; - int n, offset[4] = {0,12,6,27}; + int n; config_double in_pt; - config_PRECISION out_pt = output; - complex_PRECISION L[6][6]; - + config_PRECISION out_pt; + + int offset[4] = {0,12,6,27}; + + // construct initial L = A for n=0, L = B for n=1, L row major for ( n=0; n<2; n++ ) { - // construct initial L = A for n=0, L = B for n=1, L row major - in_pt = input+offset[2*n]; + + out_pt = output + n*36; + + in_pt = input + offset[2*n]; for ( j=0; j<6; j++ ) { - L[j][j] = (complex_PRECISION) *in_pt; in_pt++; + out_pt[6*j+j] = (complex_PRECISION) *in_pt; in_pt++; } in_pt = input+offset[2*n+1]; for ( j=0; j<5; j++ ) { for ( i=j+1; i<6; i++ ) { - L[j][i] = (complex_PRECISION) *in_pt; - L[i][j] = (complex_PRECISION) conj_double(*in_pt); in_pt++; + out_pt[6*j+i] = (complex_PRECISION) *in_pt; + out_pt[6*i+j] = (complex_PRECISION) conj_double(*in_pt); in_pt++; } } // calculate LU - for ( k=0; k<6; k++ ) { + for ( k=0; k<5; k++ ) { for ( i=k+1; i<6; i++ ) { - L[i][k] = L[i][k]/L[k][k]; // acts on L - for ( j=k+1; j<6; j++ ) - L[i][j] = L[i][j]-L[i][k]*L[k][j]; // acts on both, L and U + out_pt[6*i+k] = out_pt[6*i+k]/out_pt[6*k+k]; // L: out(i,k) = out(i,k)/out(k,k) + for ( j=k+1; j<6; j++ ) + out_pt[6*i+j] = out_pt[6*i+j]-out_pt[6*i+k]*out_pt[6*k+j]; // U: out(i,j) = out(i,j)-out(i,k)*out(k,j) } } + } +} +#endif - // output = tril(L,1) without diag row major - for ( i=0; i<6; i++ ) { - for ( j=0; j=0; i-- ) { - for ( j=i+1; j<6; j++ ) { - *out_pt = L[i][j]; out_pt++; + + in_pt = input+offset[4*n+3]; + for ( j=0; j<6; j++ ) { + for ( i=0; i<6; i++ ) { + out_pt[12*(j+6)+i] = out_pt[12*j+(i+6)] = _COMPLEX_PRECISION_ZERO; + } + } + for ( j=0; j<6; j++ ) { + out_pt[12*(j+6)+j] = out_pt[12*j+(j+6)] = (complex_PRECISION) *in_pt; in_pt++; + } + + // calculate LU + for ( k=0; k<11; k++ ) { + for ( i=k+1; i<12; i++ ) { + out_pt[12*i+k] = out_pt[12*i+k]/out_pt[12*k+k]; // L: out(i,k) = out(i,k)/out(k,k) + for ( j=k+1; j<12; j++ ) + out_pt[12*i+j] = out_pt[12*i+j]-out_pt[12*i+k]*out_pt[12*k+j]; // U: out(i,j) = out(i,j)-out(i,k)*out(k,j) } - *out_pt = L[i][i]; out_pt++; } - } } +#endif static inline void LLH_perform_fwd_bwd_subs_PRECISION( vector_PRECISION x, vector_PRECISION b, config_PRECISION L ) { @@ -172,7 +227,7 @@ 
static inline void LLH_perform_fwd_bwd_subs_PRECISION( vector_PRECISION x, vecto } } -static inline void LU_perform_fwd_bwd_subs_PRECISION( vector_PRECISION x, vector_PRECISION b, config_PRECISION L ) { +static inline void LU_perform_fwd_bwd_subs_PRECISION( vector_PRECISION x, vector_PRECISION b, config_PRECISION LU ) { /********************************************************************************* * Solves L*U*x = b for x, i.e., the clover coupling for a single lattice @@ -183,27 +238,52 @@ static inline void LU_perform_fwd_bwd_subs_PRECISION( vector_PRECISION x, vector * Note: U is given by u_{ii}=1, u_{ij}=l_{ji}* / l_{ii} *********************************************************************************/ - register int i, j; - int n; - - for ( n=0; n<2; n++ ) { - // forward substitution with L - for ( i=0; i<6; i++ ) { - x[i] = b[i]; - for ( j=0; j=0; i-- ) { + for ( j=i+1; j<12; j++ ) { + x[i] = x[i] - LU[i*12+j]*x[j]; + } + x[i] = x[i]/LU[i*(12+1)]; + } + x+=12; + b+=12; + LU+=12*12; } - // backward substitution with U - for ( i=5; i>=0; i-- ) { - for ( j=i+1; j<6; j++ ) { - x[i] = x[i] - *L * x[j]; L++; + else +#endif + for ( n=0; n<2; n++ ) { + // solve x = U^(-1) L^(-1) b + // forward substitution with L + for ( i=0; i<6; i++ ) { + x[i] = b[i]; + for ( j=0; j=0; i-- ) { + for ( j=i+1; j<6; j++ ) { + x[i] = x[i] - LU[i*6+j]*x[j]; + } + x[i] = x[i]/LU[i*(6+1)]; + } + x+=6; + b+=6; + LU+=6*6; } - x+=6; - b+=6; - } } @@ -252,32 +332,42 @@ static inline void LU_multiply_PRECISION( vector_PRECISION y, vector_PRECISION x * - config_PRECISION LU: LU decomposition *********************************************************************************/ - register int i, j; - int n; - complex_PRECISION z[6]; - - for ( n=0; n<2; n++ ) { - LU+=15; // moving to U - // z = U x - for ( i=5; i>=0; i-- ) { //row - z[i] = 0; - for ( j=i+1; j<6; j++ ) { //column - z[i] += *LU *x[j]; LU++; + register int i, j, n; + +#ifdef HAVE_TM1p1 + if( g.n_flavours == 2) + for ( n=0; n<2; n++ ) { + for ( i=0; i<12; i++ ) { + y[i] = LU[i*(12+1)]*x[i]; + for ( j=i+1; j<12; j++ ) + y[i] += LU[i*12+j]*x[j]; } - z[i] += *LU *x[i]; LU++; - } - LU-=36;// moving to L - // y = L*z; - for ( i=0; i<6; i++ ) { // rows - y[i] = z[i]; - for ( j=0; j0; i-- ) + for ( j=0; j0; i-- ) + for ( j=0; jclover; - x += start; y += start; - if ( g.csw ) { + +#ifdef HAVE_TM1p1 + if( g.n_flavours == 2) { + x += start; y += start; #ifdef OPTIMIZED_SELF_COUPLING_PRECISION - PRECISION *sc_pt = op->clover_vectorized + 2*2*(3*start); + PRECISION *sc_pt = op->clover_doublet_vectorized + (start/24)*288; PRECISION *x_pt = (PRECISION*)x; PRECISION *y_pt = (PRECISION*)y; - for ( int i=start; iepsbar_term+(start/24)*12; + if ( g.n_flavours == 2 && + ( op->epsbar != 0 || op->epsbar_ig5_odd_shift != 0 || op->epsbar_ig5_odd_shift != 0 ) ) + apply_doublet_coupling_PRECISION( x, y, epsbar_term, end-start ); +#else + config_PRECISION sc = op->clover_doublet_oo_inv + (start/24)*288; // diagonal blocks applied to the even sites - for ( int i=start; iclover_vectorized + (start/12)*144; + PRECISION *x_pt = (PRECISION*)x; + PRECISION *y_pt = (PRECISION*)y; + for ( int i=start; iclover + (start/12)*72; + // diagonal blocks applied to the even sites + for ( int i=start; iclover + (start/12)*42; + // diagonal blocks applied to the even sites + for ( int i=start; iclover + start; + for ( int i=start; inum_even_sites; - config_PRECISION sc = op->clover; - if ( g.csw ) { +#ifdef HAVE_TM1p1 + if( g.n_flavours == 2) { + int i, n1 = op->num_even_sites; + config_PRECISION sc = 
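selfcoupling_LU_decomposition_PRECISION factorizes each self-coupling block in place, row major and without pivoting (d = 6 per chirality, d = 12 for the HAVE_TM1p1 doublet), and LU_perform_fwd_bwd_subs_PRECISION then solves L U x = b by forward and backward substitution, with the unit-diagonal L stored below the diagonal and U on and above it. A generic d x d sketch of exactly this scheme (illustrative names):

#include <complex.h>

/* In-place LU without pivoting of a row-major d x d block A:
 * multipliers of L go below the diagonal, U on and above it.    */
static void lu_decompose( double complex *A, int d ) {
  for ( int k=0; k<d-1; k++ )
    for ( int i=k+1; i<d; i++ ) {
      A[i*d+k] /= A[k*d+k];                         /* L(i,k) */
      for ( int j=k+1; j<d; j++ )
        A[i*d+j] -= A[i*d+k] * A[k*d+j];            /* U update */
    }
}

/* Solve L U x = b: forward substitution with unit-diagonal L,
 * then backward substitution with U.                            */
static void lu_solve( const double complex *LU, double complex *x,
                      const double complex *b, int d ) {
  for ( int i=0; i<d; i++ ) {                       /* L y = b  */
    x[i] = b[i];
    for ( int j=0; j<i; j++ ) x[i] -= LU[i*d+j] * x[j];
  }
  for ( int i=d-1; i>=0; i-- ) {                    /* U x = y  */
    for ( int j=i+1; j<d; j++ ) x[i] -= LU[i*d+j] * x[j];
    x[i] /= LU[i*d+i];
  }
}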
op->clover_doublet_oo_inv; // diagonal blocks applied to the even sites for ( i=0; inum_even_sites; + config_PRECISION sc = op->clover; + if ( g.csw ) { + // diagonal blocks applied to the even sites + for ( i=0; inum_even_sites, n2 = op->num_odd_sites; - config_PRECISION sc = op->clover; - x += n1*12; y += n1*12; - // diagonal blocks applied to the odd sites - if ( g.csw ) { -#ifndef HAVE_TM - sc += n1*42; - for ( i=0; inum_even_sites, n2 = op->num_odd_sites; + config_PRECISION sc = op->clover_doublet_oo_inv + n1*288; + x += n1*24; y += n1*24; + // diagonal blocks applied to the even sites for ( i=0; inum_even_sites, n2 = op->num_odd_sites; + config_PRECISION sc = op->clover; + x += n1*12; y += n1*12; + // diagonal blocks applied to the odd sites + if ( g.csw ) { +#ifndef HAVE_TM + sc += n1*42; + for ( i=0; iclover; - x += start; y += start; - // inverse diagonal blocks applied to the odd sites - if ( g.csw ) { +#ifdef HAVE_TM1p1 + if( g.n_flavours == 2) { + x += start; y += start; + // inverse diagonal blocks applied to the odd sites #ifdef OPTIMIZED_SELF_COUPLING_PRECISION - PRECISION *sc_pt = op->clover_vectorized + 2*2*(3*start); + PRECISION *sc_pt = op->clover_doublet_oo_inv_vectorized + (start/24)*2*288; PRECISION *x_pt = (PRECISION*)x; PRECISION *y_pt = (PRECISION*)y; - for ( int i=start; iclover_doublet_oo_inv + (start/24)*288; + for ( int i=start; iclover; + x += start; y += start; + // inverse diagonal blocks applied to the odd sites + if ( g.csw ) { +#ifdef OPTIMIZED_SELF_COUPLING_PRECISION + PRECISION *sc_pt = op->clover_vectorized + 2*2*(3*start); + PRECISION *x_pt = (PRECISION*)x; + PRECISION *y_pt = (PRECISION*)y; + for ( int i=start; inum_inner_lattice_sites, oe_offset=0, mu, nu, - sc_size = 42, lu_dec_size = 42, bs, **bt = NULL, + sc_size = g.csw ? 
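The even/odd clover applications and the precomputed clover_oo_inv (and clover_doublet_oo_inv) blocks above are the ingredients of the odd-even reduction. Written out for orientation (the standard Schur-complement decomposition, not copied from the source), with the operator in site-ordered block form

    D = ( A_ee  D_eo )
        ( D_oe  A_oo ),

the solver works on the even-site Schur complement system

    ( A_ee - D_eo A_oo^{-1} D_oe ) x_e = b_e - D_eo A_oo^{-1} b_o,

and afterwards reconstructs the odd part as

    x_o = A_oo^{-1} ( b_o - D_oe x_e ).

The LU factors stored per odd site are what make every A_oo^{-1} application a cheap pair of triangular solves.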
42:12, lu_dec_size = 42, bs, **bt = NULL, *eot = NULL, *nt = NULL, *tt = NULL, t, z, y, x, le[4], N[4]; config_double sc_in = in->clover, nc_in = in->D; config_PRECISION Aee = NULL, Aoo = NULL; operator_PRECISION_struct *op = &(l->oe_op_PRECISION); + op->m0 = in->m0; + #ifdef HAVE_TM + op->mu = in->mu; + op->mu_even_shift = in->mu_even_shift; + op->mu_odd_shift = in->mu_odd_shift; + lu_dec_size = 72; config_double tm_term_in = in->tm_term; #endif @@ -491,51 +666,55 @@ void oddeven_setup_PRECISION( operator_double_struct *in, level_struct *l ) { for ( x=0; xepsbar_term; + sc_in = in->clover; +#ifdef HAVE_TM + tm_term_in = in->tm_term; +#endif + op->epsbar = in->epsbar; + op->epsbar_ig5_even_shift = in->epsbar_ig5_even_shift; + op->epsbar_ig5_odd_shift = in->epsbar_ig5_odd_shift; + + // re-order clover term (i.e., self coupling) + MALLOC( op->clover_doublet_oo_inv, complex_PRECISION, lu_doublet_dec_size*n ); + Aee = op->clover_doublet_oo_inv; + Aoo = op->clover_doublet_oo_inv + op->num_even_sites*lu_doublet_dec_size; +#ifdef OPTIMIZED_SELF_COUPLING_PRECISION + MALLOC_HUGEPAGES( op->clover_doublet_vectorized, PRECISION, l->num_inner_lattice_sites*2*4*36, 4*SIMD_LENGTH_PRECISION ); + MALLOC_HUGEPAGES( op->clover_doublet_oo_inv_vectorized, PRECISION, op->num_odd_sites*2*2*144, 4*SIMD_LENGTH_PRECISION ); + PRECISION *Aee_vectorized = op->clover_doublet_vectorized; + PRECISION *Aoo_vectorized = op->clover_doublet_vectorized + op->num_even_sites*288; + PRECISION *Aoo_inverse_vectorized = op->clover_doublet_oo_inv_vectorized; +#endif + for ( t=0; tD, complex_PRECISION, 36*n ); @@ -576,7 +867,7 @@ void oddeven_setup_PRECISION( operator_double_struct *in, level_struct *l ) { k++; } -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION +#ifdef OPTIMIZED_NEIGHBOR_COUPLING_float // D_vectorized just used in the float environment MALLOC_HUGEPAGES( op->D_vectorized, PRECISION, 2*4*l->inner_vector_size, 4*SIMD_LENGTH_PRECISION ); MALLOC_HUGEPAGES( op->D_transformed_vectorized, PRECISION, 2*4*l->inner_vector_size, 4*SIMD_LENGTH_PRECISION ); for ( int i=0; inum_inner_lattice_sites; i++ ) { @@ -621,13 +912,21 @@ void oddeven_setup_PRECISION( operator_double_struct *in, level_struct *l ) { define_eo_bt( bt, eot, op->c.num_even_boundary_sites, op->c.num_odd_boundary_sites, op->c.num_boundary_sites, N, l ); j = (l->num_lattice_site_var/2)*l->num_lattice_sites; +#ifdef HAVE_TM1p1 + j *= 2; +#endif MALLOC( op->prnT, complex_PRECISION, j*8 ); op->prnZ = op->prnT + j; op->prnY = op->prnZ + j; op->prnX = op->prnY + j; op->prpT = op->prnX + j; op->prpZ = op->prpT + j; op->prpY = op->prpZ + j; op->prpX = op->prpY + j; MALLOC( op->buffer, complex_PRECISION*, 2 ); op->buffer[0] = NULL; +#ifdef HAVE_TM1p1 + MALLOC( op->buffer[0], complex_PRECISION, 4*l->vector_size ); + op->buffer[1] = op->buffer[0] + 2*l->vector_size; +#else MALLOC( op->buffer[0], complex_PRECISION, 2*l->vector_size ); op->buffer[1] = op->buffer[0] + l->vector_size; +#endif ghost_alloc_PRECISION( 0, &(op->c), l ); ghost_sendrecv_init_PRECISION( _COARSE_GLOBAL, &(op->c), l ) ; l->sp_PRECISION.v_end = op->num_even_sites*l->num_lattice_site_var; @@ -642,12 +941,16 @@ void oddeven_free_PRECISION( level_struct *l ) { lu_dec_size = 72; #endif -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION +#ifdef OPTIMIZED_NEIGHBOR_COUPLING_float FREE_HUGEPAGES( l->oe_op_PRECISION.D_vectorized, PRECISION, 2*4*l->inner_vector_size ); FREE_HUGEPAGES( l->oe_op_PRECISION.D_transformed_vectorized, PRECISION, 2*4*l->inner_vector_size ); #endif #ifdef OPTIMIZED_SELF_COUPLING_PRECISION 
FREE_HUGEPAGES( l->oe_op_PRECISION.clover_vectorized, PRECISION, l->num_inner_lattice_sites*2*2*36 ); +#ifdef HAVE_TM1p1 + FREE_HUGEPAGES( l->oe_op_PRECISION.clover_doublet_vectorized, PRECISION, l->num_inner_lattice_sites*2*4*36 ); + FREE_HUGEPAGES( l->oe_op_PRECISION.clover_doublet_oo_inv_vectorized, PRECISION, l->num_inner_lattice_sites*2*2*144 ); +#endif #endif ghost_free_PRECISION( &(l->oe_op_PRECISION.c), l ); @@ -671,9 +974,18 @@ void oddeven_free_PRECISION( level_struct *l ) { l->oe_op_PRECISION.c.boundary_table[2*mu+1] = NULL; } +#ifdef HAVE_TM1p1 + FREE( l->oe_op_PRECISION.buffer[0], complex_PRECISION, 4*l->vector_size ); +#else FREE( l->oe_op_PRECISION.buffer[0], complex_PRECISION, 2*l->vector_size ); +#endif FREE( l->oe_op_PRECISION.buffer, complex_PRECISION*, 2 ); +#ifdef HAVE_TM1p1 + FREE( l->oe_op_PRECISION.prnT, complex_PRECISION, 2*(l->num_lattice_site_var/2)*l->num_lattice_sites*8 ); + FREE( l->oe_op_PRECISION.clover_doublet_oo_inv, complex_PRECISION, 288*n ); +#else FREE( l->oe_op_PRECISION.prnT, complex_PRECISION, (l->num_lattice_site_var/2)*l->num_lattice_sites*8 ); +#endif } @@ -772,19 +1084,20 @@ void block_to_oddeven_PRECISION( vector_PRECISION out, vector_PRECISION in, leve SYNC_CORES(threading) } -#ifndef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION void hopping_term_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operator_PRECISION_struct *op, const int amount, level_struct *l, struct Thread *threading ) { - - int start_even, end_even, start_odd, end_odd; - compute_core_start_end_custom(0, op->num_even_sites, &start_even, &end_even, l, threading, 1 ); - compute_core_start_end_custom(op->num_even_sites, op->num_even_sites+op->num_odd_sites, &start_odd, &end_odd, l, threading, 1 ); - int i, n = l->num_inner_lattice_sites, *neighbor = op->neighbor_table, *nb_pt, - start=0, plus_dir_param=_FULL_SYSTEM, minus_dir_param=_FULL_SYSTEM; - complex_PRECISION pbuf[6]; - vector_PRECISION phi_pt, eta_pt, end_pt; - config_PRECISION D_pt; + int start_even, end_even, start_odd, end_odd, n = l->num_inner_lattice_sites, + *neighbor = op->neighbor_table, start=0, plus_dir_param=_FULL_SYSTEM, minus_dir_param=_FULL_SYSTEM; + + SYNC_CORES(threading) + + if ( amount == _EVEN_SITES || amount == _ODD_SITES ) { + compute_core_start_end_custom(0, op->num_even_sites, &start_even, &end_even, l, threading, 1 ); + compute_core_start_end_custom(op->num_even_sites, op->num_even_sites+op->num_odd_sites, &start_odd, &end_odd, l, threading, 1 ); + } else { + compute_core_start_end_custom(0, l->num_inner_lattice_sites, &start, &n, l, threading, 1 ); + } SYNC_CORES(threading) @@ -797,134 +1110,290 @@ void hopping_term_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operato minus_dir_param = _EVEN_SITES; plus_dir_param = _ODD_SITES; } - // project in negative directions - for ( i=6*start, phi_pt=phi+12*start; i<6*n; i+=6, phi_pt+=12 ) { - prp_T_PRECISION( op->prnT+i, phi_pt ); - prp_Z_PRECISION( op->prnZ+i, phi_pt ); - prp_Y_PRECISION( op->prnY+i, phi_pt ); - prp_X_PRECISION( op->prnX+i, phi_pt ); - } - // start communication in negative direction - START_LOCKED_MASTER(threading) - ghost_sendrecv_PRECISION( op->prnT, T, -1, &(op->c), minus_dir_param, l ); - ghost_sendrecv_PRECISION( op->prnZ, Z, -1, &(op->c), minus_dir_param, l ); - ghost_sendrecv_PRECISION( op->prnY, Y, -1, &(op->c), minus_dir_param, l ); - ghost_sendrecv_PRECISION( op->prnX, X, -1, &(op->c), minus_dir_param, l ); - END_LOCKED_MASTER(threading) - // project plus dir and multiply with U dagger - for ( phi_pt=phi+12*start, 
end_pt=phi+12*n, D_pt = op->D+36*start, nb_pt=neighbor+4*start; phi_ptprpT+i, D_pt, pbuf ); - mvmh_PRECISION( op->prpT+i+3, D_pt, pbuf+3 ); D_pt += 9; - // Z dir - i = 6*(*nb_pt); nb_pt++; - prn_Z_PRECISION( pbuf, phi_pt ); - mvmh_PRECISION( op->prpZ+i, D_pt, pbuf ); - mvmh_PRECISION( op->prpZ+i+3, D_pt, pbuf+3 ); D_pt += 9; - // Y dir - i = 6*(*nb_pt); nb_pt++; - prn_Y_PRECISION( pbuf, phi_pt ); - mvmh_PRECISION( op->prpY+i, D_pt, pbuf ); - mvmh_PRECISION( op->prpY+i+3, D_pt, pbuf+3 ); D_pt += 9; - // X dir - i = 6*(*nb_pt); nb_pt++; - prn_X_PRECISION( pbuf, phi_pt ); - mvmh_PRECISION( op->prpX+i, D_pt, pbuf ); - mvmh_PRECISION( op->prpX+i+3, D_pt, pbuf+3 ); D_pt += 9; - } - if ( amount == _EVEN_SITES ) { - start = start_even, n = end_even; - } else if ( amount == _ODD_SITES ) { - start = start_odd, n = end_odd; - } - // start communication in positive direction - START_LOCKED_MASTER(threading) - ghost_sendrecv_PRECISION( op->prpT, T, +1, &(op->c), plus_dir_param, l ); - ghost_sendrecv_PRECISION( op->prpZ, Z, +1, &(op->c), plus_dir_param, l ); - ghost_sendrecv_PRECISION( op->prpY, Y, +1, &(op->c), plus_dir_param, l ); - ghost_sendrecv_PRECISION( op->prpX, X, +1, &(op->c), plus_dir_param, l ); - // wait for communication in negative direction - ghost_wait_PRECISION( op->prnT, T, -1, &(op->c), minus_dir_param, l ); - ghost_wait_PRECISION( op->prnZ, Z, -1, &(op->c), minus_dir_param, l ); - ghost_wait_PRECISION( op->prnY, Y, -1, &(op->c), minus_dir_param, l ); - ghost_wait_PRECISION( op->prnX, X, -1, &(op->c), minus_dir_param, l ); - END_LOCKED_MASTER(threading) - // multiply with U and lift up minus dir - for ( eta_pt=eta+12*start, end_pt=eta+12*n, D_pt = op->D+36*start, nb_pt=neighbor+4*start; eta_ptprnT+i ); - mvm_PRECISION( pbuf+3, D_pt, op->prnT+i+3 ); - pbp_su3_T_PRECISION( pbuf, eta_pt ); D_pt += 9; - // Z dir - i = 6*(*nb_pt); nb_pt++; - mvm_PRECISION( pbuf, D_pt, op->prnZ+i ); - mvm_PRECISION( pbuf+3, D_pt, op->prnZ+i+3 ); - pbp_su3_Z_PRECISION( pbuf, eta_pt ); D_pt += 9; - // Y dir - i = 6*(*nb_pt); nb_pt++; - mvm_PRECISION( pbuf, D_pt, op->prnY+i ); - mvm_PRECISION( pbuf+3, D_pt, op->prnY+i+3 ); - pbp_su3_Y_PRECISION( pbuf, eta_pt ); D_pt += 9; - // X dir - i = 6*(*nb_pt); nb_pt++; - mvm_PRECISION( pbuf, D_pt, op->prnX+i ); - mvm_PRECISION( pbuf+3, D_pt, op->prnX+i+3 ); - pbp_su3_X_PRECISION( pbuf, eta_pt ); D_pt += 9; - } - // wait for communication in positive direction - START_LOCKED_MASTER(threading) - ghost_wait_PRECISION( op->prpT, T, +1, &(op->c), plus_dir_param, l ); - ghost_wait_PRECISION( op->prpZ, Z, +1, &(op->c), plus_dir_param, l ); - ghost_wait_PRECISION( op->prpY, Y, +1, &(op->c), plus_dir_param, l ); - ghost_wait_PRECISION( op->prpX, X, +1, &(op->c), plus_dir_param, l ); - END_LOCKED_MASTER(threading) - // lift up plus dir - for ( i=6*start, eta_pt=eta+12*start; i<6*n; i+=6, eta_pt+=12 ) { - pbn_su3_T_PRECISION( op->prpT+i, eta_pt ); - pbn_su3_Z_PRECISION( op->prpZ+i, eta_pt ); - pbn_su3_Y_PRECISION( op->prpY+i, eta_pt ); - pbn_su3_X_PRECISION( op->prpX+i, eta_pt ); - } - SYNC_CORES(threading) -} +#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION + complex_PRECISION *prn[4] = { op->prnT, op->prnZ, op->prnY, op->prnX }; + complex_PRECISION *prp[4] = { op->prpT, op->prpZ, op->prpY, op->prpX }; +#else + int i, *nb_pt; + vector_PRECISION phi_pt, eta_pt, end_pt; + config_PRECISION D_pt; #endif - -void apply_schur_complement_PRECISION( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, - level_struct *l, struct Thread *threading ) { - 
-/********************************************************************************* -* Applies the Schur complement to a vector. -*********************************************************************************/ - - // start and end indices for vector functions depending on thread - int start_even, end_even, start_odd, end_odd; - - compute_core_start_end_custom(0, op->num_even_sites*l->num_lattice_site_var, &start_even, &end_even, l, threading, 12 ); - compute_core_start_end_custom(op->num_even_sites*l->num_lattice_site_var, l->inner_vector_size, &start_odd, &end_odd, l, threading, 12 ); - - vector_PRECISION *tmp = op->buffer; - - SYNC_CORES(threading) - vector_PRECISION_define( tmp[0], 0, start_odd, end_odd, l ); - vector_PRECISION_define( tmp[0], 0, start_even, end_even, l ); - SYNC_CORES(threading) - PROF_PRECISION_START( _NC, threading ); - - PROF_PRECISION_START( _SC, threading ); - diag_ee_PRECISION( out, in, op, l, start_even, end_even ); - SYNC_CORES(threading) - PROF_PRECISION_STOP( _SC, 1, threading ); - hopping_term_PRECISION( tmp[0], in, op, _ODD_SITES, l, threading ); - PROF_PRECISION_STOP( _NC, 0, threading ); - PROF_PRECISION_START( _SC, threading ); - diag_oo_inv_PRECISION( tmp[1], tmp[0], op, l, start_odd, end_odd ); - SYNC_CORES(threading) - PROF_PRECISION_STOP( _SC, 0, threading ); +#ifdef HAVE_TM1p1 + if( g.n_flavours == 2 ) { + // project in negative directions +#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION + dprp_PRECISION( prn, phi, 24*start, 24*n ); +#else + complex_PRECISION pbuf[12]; + for ( i=12*start, phi_pt=phi+24*start; i<12*n; i+=12, phi_pt+=24 ) { + dprp_T_PRECISION( op->prnT+i, phi_pt ); + dprp_Z_PRECISION( op->prnZ+i, phi_pt ); + dprp_Y_PRECISION( op->prnY+i, phi_pt ); + dprp_X_PRECISION( op->prnX+i, phi_pt ); + } +#endif + // start communication in negative direction + START_LOCKED_MASTER(threading) + ghost_sendrecv_PRECISION( op->prnT, T, -1, &(op->c), minus_dir_param, l ); + ghost_sendrecv_PRECISION( op->prnZ, Z, -1, &(op->c), minus_dir_param, l ); + ghost_sendrecv_PRECISION( op->prnY, Y, -1, &(op->c), minus_dir_param, l ); + ghost_sendrecv_PRECISION( op->prnX, X, -1, &(op->c), minus_dir_param, l ); + END_LOCKED_MASTER(threading) + // project plus dir and multiply with U dagger +#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION + dprn_su3_PRECISION( prp, phi, op, neighbor, 24*start, 24*n ); +#else + for ( phi_pt=phi+24*start, end_pt=phi+24*n, D_pt = op->D+36*start, nb_pt=neighbor+4*start; phi_ptprpT+i, D_pt, pbuf ); + mvmh_PRECISION( op->prpT+i+3, D_pt, pbuf+3 ); + mvmh_PRECISION( op->prpT+i+6, D_pt, pbuf+6 ); + mvmh_PRECISION( op->prpT+i+9, D_pt, pbuf+9 ); D_pt += 9; + // Z dir + i = 12*(*nb_pt); nb_pt++; + dprn_Z_PRECISION( pbuf, phi_pt ); + mvmh_PRECISION( op->prpZ+i, D_pt, pbuf ); + mvmh_PRECISION( op->prpZ+i+3, D_pt, pbuf+3 ); + mvmh_PRECISION( op->prpZ+i+6, D_pt, pbuf+6 ); + mvmh_PRECISION( op->prpZ+i+9, D_pt, pbuf+9 ); D_pt += 9; + // Y dir + i = 12*(*nb_pt); nb_pt++; + dprn_Y_PRECISION( pbuf, phi_pt ); + mvmh_PRECISION( op->prpY+i, D_pt, pbuf ); + mvmh_PRECISION( op->prpY+i+3, D_pt, pbuf+3 ); + mvmh_PRECISION( op->prpY+i+6, D_pt, pbuf+6 ); + mvmh_PRECISION( op->prpY+i+9, D_pt, pbuf+9 ); D_pt += 9; + // X dir + i = 12*(*nb_pt); nb_pt++; + dprn_X_PRECISION( pbuf, phi_pt ); + mvmh_PRECISION( op->prpX+i, D_pt, pbuf ); + mvmh_PRECISION( op->prpX+i+3, D_pt, pbuf+3 ); + mvmh_PRECISION( op->prpX+i+6, D_pt, pbuf+6 ); + mvmh_PRECISION( op->prpX+i+9, D_pt, pbuf+9 ); D_pt += 9; + } +#endif + if ( amount == _EVEN_SITES ) { + start = start_even, n = end_even; + } 
else if ( amount == _ODD_SITES ) { + start = start_odd, n = end_odd; + } + // start communication in positive direction + START_LOCKED_MASTER(threading) + ghost_sendrecv_PRECISION( op->prpT, T, +1, &(op->c), plus_dir_param, l ); + ghost_sendrecv_PRECISION( op->prpZ, Z, +1, &(op->c), plus_dir_param, l ); + ghost_sendrecv_PRECISION( op->prpY, Y, +1, &(op->c), plus_dir_param, l ); + ghost_sendrecv_PRECISION( op->prpX, X, +1, &(op->c), plus_dir_param, l ); + // wait for communication in negative direction + ghost_wait_PRECISION( op->prnT, T, -1, &(op->c), minus_dir_param, l ); + ghost_wait_PRECISION( op->prnZ, Z, -1, &(op->c), minus_dir_param, l ); + ghost_wait_PRECISION( op->prnY, Y, -1, &(op->c), minus_dir_param, l ); + ghost_wait_PRECISION( op->prnX, X, -1, &(op->c), minus_dir_param, l ); + END_LOCKED_MASTER(threading) + // multiply with U and lift up minus dir +#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION + su3_dpbp_PRECISION( eta, prn, op, neighbor, 24*start, 24*n ); +#else + for ( eta_pt=eta+24*start, end_pt=eta+24*n, D_pt = op->D+36*start, nb_pt=neighbor+4*start; eta_ptprnT+i ); + mvm_PRECISION( pbuf+3, D_pt, op->prnT+i+3 ); + mvm_PRECISION( pbuf+6, D_pt, op->prnT+i+6 ); + mvm_PRECISION( pbuf+9, D_pt, op->prnT+i+9 ); + dpbp_su3_T_PRECISION( pbuf, eta_pt ); D_pt += 9; + // Z dir + i = 12*(*nb_pt); nb_pt++; + mvm_PRECISION( pbuf, D_pt, op->prnZ+i ); + mvm_PRECISION( pbuf+3, D_pt, op->prnZ+i+3 ); + mvm_PRECISION( pbuf+6, D_pt, op->prnZ+i+6 ); + mvm_PRECISION( pbuf+9, D_pt, op->prnZ+i+9 ); + dpbp_su3_Z_PRECISION( pbuf, eta_pt ); D_pt += 9; + // Y dir + i = 12*(*nb_pt); nb_pt++; + mvm_PRECISION( pbuf, D_pt, op->prnY+i ); + mvm_PRECISION( pbuf+3, D_pt, op->prnY+i+3 ); + mvm_PRECISION( pbuf+6, D_pt, op->prnY+i+6 ); + mvm_PRECISION( pbuf+9, D_pt, op->prnY+i+9 ); + dpbp_su3_Y_PRECISION( pbuf, eta_pt ); D_pt += 9; + // X dir + i = 12*(*nb_pt); nb_pt++; + mvm_PRECISION( pbuf, D_pt, op->prnX+i ); + mvm_PRECISION( pbuf+3, D_pt, op->prnX+i+3 ); + mvm_PRECISION( pbuf+6, D_pt, op->prnX+i+6 ); + mvm_PRECISION( pbuf+9, D_pt, op->prnX+i+9 ); + dpbp_su3_X_PRECISION( pbuf, eta_pt ); D_pt += 9; + } +#endif + // wait for communication in positive direction + START_LOCKED_MASTER(threading) + ghost_wait_PRECISION( op->prpT, T, +1, &(op->c), plus_dir_param, l ); + ghost_wait_PRECISION( op->prpZ, Z, +1, &(op->c), plus_dir_param, l ); + ghost_wait_PRECISION( op->prpY, Y, +1, &(op->c), plus_dir_param, l ); + ghost_wait_PRECISION( op->prpX, X, +1, &(op->c), plus_dir_param, l ); + END_LOCKED_MASTER(threading) + // lift up plus dir +#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION + dpbn_PRECISION( eta, prp, 24*start, 24*n ); +#else + for ( i=12*start, eta_pt=eta+24*start; i<12*n; i+=12, eta_pt+=24 ) { + dpbn_su3_T_PRECISION( op->prpT+i, eta_pt ); + dpbn_su3_Z_PRECISION( op->prpZ+i, eta_pt ); + dpbn_su3_Y_PRECISION( op->prpY+i, eta_pt ); + dpbn_su3_X_PRECISION( op->prpX+i, eta_pt ); + } +#endif + } else { +#endif + // project in negative directions +#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION + prp_PRECISION( prn, phi, 12*start, 12*n ); +#else + complex_PRECISION pbuf[6]; + for ( i=6*start, phi_pt=phi+12*start; i<6*n; i+=6, phi_pt+=12 ) { + prp_T_PRECISION( op->prnT+i, phi_pt ); + prp_Z_PRECISION( op->prnZ+i, phi_pt ); + prp_Y_PRECISION( op->prnY+i, phi_pt ); + prp_X_PRECISION( op->prnX+i, phi_pt ); + } +#endif + // start communication in negative direction + START_LOCKED_MASTER(threading) + ghost_sendrecv_PRECISION( op->prnT, T, -1, &(op->c), minus_dir_param, l ); + ghost_sendrecv_PRECISION( op->prnZ, Z, -1, &(op->c), 
minus_dir_param, l ); + ghost_sendrecv_PRECISION( op->prnY, Y, -1, &(op->c), minus_dir_param, l ); + ghost_sendrecv_PRECISION( op->prnX, X, -1, &(op->c), minus_dir_param, l ); + END_LOCKED_MASTER(threading) + // project plus dir and multiply with U dagger +#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION + prn_su3_PRECISION( prp, phi, op, neighbor, 12*start, 12*n ); +#else + for ( phi_pt=phi+12*start, end_pt=phi+12*n, D_pt = op->D+36*start, nb_pt=neighbor+4*start; phi_ptprpT+i, D_pt, pbuf ); + mvmh_PRECISION( op->prpT+i+3, D_pt, pbuf+3 ); D_pt += 9; + // Z dir + i = 6*(*nb_pt); nb_pt++; + prn_Z_PRECISION( pbuf, phi_pt ); + mvmh_PRECISION( op->prpZ+i, D_pt, pbuf ); + mvmh_PRECISION( op->prpZ+i+3, D_pt, pbuf+3 ); D_pt += 9; + // Y dir + i = 6*(*nb_pt); nb_pt++; + prn_Y_PRECISION( pbuf, phi_pt ); + mvmh_PRECISION( op->prpY+i, D_pt, pbuf ); + mvmh_PRECISION( op->prpY+i+3, D_pt, pbuf+3 ); D_pt += 9; + // X dir + i = 6*(*nb_pt); nb_pt++; + prn_X_PRECISION( pbuf, phi_pt ); + mvmh_PRECISION( op->prpX+i, D_pt, pbuf ); + mvmh_PRECISION( op->prpX+i+3, D_pt, pbuf+3 ); D_pt += 9; + } +#endif + if ( amount == _EVEN_SITES ) { + start = start_even, n = end_even; + } else if ( amount == _ODD_SITES ) { + start = start_odd, n = end_odd; + } + // start communication in positive direction + START_LOCKED_MASTER(threading) + ghost_sendrecv_PRECISION( op->prpT, T, +1, &(op->c), plus_dir_param, l ); + ghost_sendrecv_PRECISION( op->prpZ, Z, +1, &(op->c), plus_dir_param, l ); + ghost_sendrecv_PRECISION( op->prpY, Y, +1, &(op->c), plus_dir_param, l ); + ghost_sendrecv_PRECISION( op->prpX, X, +1, &(op->c), plus_dir_param, l ); + // wait for communication in negative direction + ghost_wait_PRECISION( op->prnT, T, -1, &(op->c), minus_dir_param, l ); + ghost_wait_PRECISION( op->prnZ, Z, -1, &(op->c), minus_dir_param, l ); + ghost_wait_PRECISION( op->prnY, Y, -1, &(op->c), minus_dir_param, l ); + ghost_wait_PRECISION( op->prnX, X, -1, &(op->c), minus_dir_param, l ); + END_LOCKED_MASTER(threading) + // multiply with U and lift up minus dir +#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION + su3_pbp_PRECISION( eta, prn, op, neighbor, 12*start, 12*n ); +#else + for ( eta_pt=eta+12*start, end_pt=eta+12*n, D_pt = op->D+36*start, nb_pt=neighbor+4*start; eta_ptprnT+i ); + mvm_PRECISION( pbuf+3, D_pt, op->prnT+i+3 ); + pbp_su3_T_PRECISION( pbuf, eta_pt ); D_pt += 9; + // Z dir + i = 6*(*nb_pt); nb_pt++; + mvm_PRECISION( pbuf, D_pt, op->prnZ+i ); + mvm_PRECISION( pbuf+3, D_pt, op->prnZ+i+3 ); + pbp_su3_Z_PRECISION( pbuf, eta_pt ); D_pt += 9; + // Y dir + i = 6*(*nb_pt); nb_pt++; + mvm_PRECISION( pbuf, D_pt, op->prnY+i ); + mvm_PRECISION( pbuf+3, D_pt, op->prnY+i+3 ); + pbp_su3_Y_PRECISION( pbuf, eta_pt ); D_pt += 9; + // X dir + i = 6*(*nb_pt); nb_pt++; + mvm_PRECISION( pbuf, D_pt, op->prnX+i ); + mvm_PRECISION( pbuf+3, D_pt, op->prnX+i+3 ); + pbp_su3_X_PRECISION( pbuf, eta_pt ); D_pt += 9; + } +#endif + // wait for communication in positive direction + START_LOCKED_MASTER(threading) + ghost_wait_PRECISION( op->prpT, T, +1, &(op->c), plus_dir_param, l ); + ghost_wait_PRECISION( op->prpZ, Z, +1, &(op->c), plus_dir_param, l ); + ghost_wait_PRECISION( op->prpY, Y, +1, &(op->c), plus_dir_param, l ); + ghost_wait_PRECISION( op->prpX, X, +1, &(op->c), plus_dir_param, l ); + END_LOCKED_MASTER(threading) + // lift up plus dir +#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION + pbn_PRECISION( eta, prp, 12*start, 12*n ); +#else + for ( i=6*start, eta_pt=eta+12*start; i<6*n; i+=6, eta_pt+=12 ) { + pbn_su3_T_PRECISION( op->prpT+i, eta_pt ); + 
pbn_su3_Z_PRECISION( op->prpZ+i, eta_pt ); + pbn_su3_Y_PRECISION( op->prpY+i, eta_pt ); + pbn_su3_X_PRECISION( op->prpX+i, eta_pt ); + } +#endif +#ifdef HAVE_TM1p1 + } +#endif + + SYNC_CORES(threading) +} + +void apply_schur_complement_PRECISION( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, + level_struct *l, struct Thread *threading ) { + +/********************************************************************************* +* Applies the Schur complement to a vector. +*********************************************************************************/ + + // start and end indices for vector functions depending on thread + int start_even, end_even, start_odd, end_odd; + + compute_core_start_end_custom(0, op->num_even_sites*l->num_lattice_site_var, &start_even, &end_even, l, threading, l->num_lattice_site_var ); + compute_core_start_end_custom(op->num_even_sites*l->num_lattice_site_var, l->inner_vector_size, &start_odd, &end_odd, l, threading, l->num_lattice_site_var ); + + vector_PRECISION *tmp = op->buffer; + + SYNC_CORES(threading) + vector_PRECISION_define_zero( tmp[0], 0, l->inner_vector_size, l, threading ); + SYNC_CORES(threading) + PROF_PRECISION_START( _NC, threading ); + + PROF_PRECISION_START( _SC, threading ); + diag_ee_PRECISION( out, in, op, l, start_even, end_even ); + SYNC_CORES(threading) + PROF_PRECISION_STOP( _SC, 1, threading ); + hopping_term_PRECISION( tmp[0], in, op, _ODD_SITES, l, threading ); + PROF_PRECISION_STOP( _NC, 0, threading ); + + PROF_PRECISION_START( _SC, threading ); + diag_oo_inv_PRECISION( tmp[1], tmp[0], op, l, start_odd, end_odd ); + SYNC_CORES(threading) + PROF_PRECISION_STOP( _SC, 0, threading ); PROF_PRECISION_START( _NC, threading ); hopping_term_PRECISION( tmp[0], tmp[1], op, _EVEN_SITES, l, threading ); PROF_PRECISION_STOP( _NC, 1, threading ); @@ -935,19 +1404,18 @@ void apply_schur_complement_PRECISION( vector_PRECISION out, vector_PRECISION in void solve_oddeven_PRECISION( gmres_PRECISION_struct *p, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { // start and end indices for vector functions depending on thread - int start; - int end; - compute_core_start_end(op->num_even_sites*l->num_lattice_site_var, l->inner_vector_size, &start, &end, l, threading); + int start=op->num_even_sites*l->num_lattice_site_var, end=l->inner_vector_size, thread_start, thread_end; + compute_core_start_end(op->num_even_sites*l->num_lattice_site_var, l->inner_vector_size, &thread_start, &thread_end, l, threading); vector_PRECISION tmp = op->buffer[0]; // odd to even PROF_PRECISION_START( _SC, threading ); - diag_oo_inv_PRECISION( tmp, p->b, op, l, start, end ); + diag_oo_inv_PRECISION( tmp, p->b, op, l, thread_start, thread_end ); PROF_PRECISION_STOP( _SC, 0, threading ); - SYNC_CORES(threading) - vector_PRECISION_scale( tmp, tmp, -1, start, end, l ); - SYNC_CORES(threading) + SYNC_CORES(threading); + vector_PRECISION_scale( tmp, tmp, -1, op->num_even_sites*l->num_lattice_site_var, l->inner_vector_size, l, threading ); + SYNC_CORES(threading); PROF_PRECISION_START( _NC, threading ); hopping_term_PRECISION( p->b, tmp, op, _EVEN_SITES, l, threading ); PROF_PRECISION_STOP( _NC, 0, threading ); @@ -956,20 +1424,20 @@ void solve_oddeven_PRECISION( gmres_PRECISION_struct *p, operator_PRECISION_stru fgmres_PRECISION( p, l, threading ); else if ( g.method == 5 ) bicgstab_PRECISION( p, l, threading ); - diag_oo_inv_PRECISION( p->x, p->b, op, l, start, end ); + diag_oo_inv_PRECISION( p->x, p->b, op, l, thread_start, 
thread_end ); // even to odd - SYNC_CORES(threading) - vector_PRECISION_define( tmp, 0, start, end, l ); + SYNC_CORES(threading); + vector_PRECISION_define_zero( tmp, start, end, l, threading ); SYNC_CORES(threading) PROF_PRECISION_START( _NC, threading ); hopping_term_PRECISION( tmp, p->x, op, _ODD_SITES, l, threading ); PROF_PRECISION_STOP( _NC, 1, threading ); PROF_PRECISION_START( _SC, threading ); - diag_oo_inv_PRECISION( p->b, tmp, op, l, start, end ); + diag_oo_inv_PRECISION( p->b, tmp, op, l, thread_start, thread_end ); PROF_PRECISION_STOP( _SC, 1, threading ); SYNC_CORES(threading) - vector_PRECISION_minus( p->x, p->x, p->b, start, end, l ); + vector_PRECISION_minus( p->x, p->x, p->b, thread_start, thread_end, l ); SYNC_CORES(threading) } @@ -1024,14 +1492,13 @@ void g5D_apply_schur_complement_PRECISION( vector_PRECISION out, vector_PRECISIO // start and end indices for vector functions depending on thread int start_even, end_even, start_odd, end_odd; - compute_core_start_end_custom(0, op->num_even_sites*l->num_lattice_site_var, &start_even, &end_even, l, threading, 12 ); - compute_core_start_end_custom(op->num_even_sites*l->num_lattice_site_var, l->inner_vector_size, &start_odd, &end_odd, l, threading, 12 ); + compute_core_start_end_custom(0, op->num_even_sites*l->num_lattice_site_var, &start_even, &end_even, l, threading, l->num_lattice_site_var ); + compute_core_start_end_custom(op->num_even_sites*l->num_lattice_site_var, l->inner_vector_size, &start_odd, &end_odd, l, threading, l->num_lattice_site_var ); vector_PRECISION *tmp = op->buffer; - SYNC_CORES(threading) - vector_PRECISION_define( tmp[0], 0, start_odd, end_odd, l ); - vector_PRECISION_define( tmp[0], 0, start_even, end_even, l ); + SYNC_CORES(threading); + vector_PRECISION_define_zero( tmp[0], 0, l->inner_vector_size, l, threading ); SYNC_CORES(threading) PROF_PRECISION_START( _NC, threading ); @@ -1059,70 +1526,77 @@ void g5D_apply_schur_complement_PRECISION( vector_PRECISION out, vector_PRECISIO void g5D_solve_oddeven_PRECISION( gmres_PRECISION_struct *p, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { - int start_even, end_even, start_odd, end_odd; - compute_core_start_end_custom(0, op->num_even_sites*l->num_lattice_site_var, &start_even, &end_even, l, threading, 12 ); - compute_core_start_end_custom(op->num_even_sites*l->num_lattice_site_var, l->inner_vector_size, &start_odd, &end_odd, l, threading, 12 ); + int start_even = 0, end_even = op->num_even_sites*l->num_lattice_site_var, + start_odd = end_even, end_odd = l->inner_vector_size; + int thread_start_even, thread_end_even, thread_start_odd, thread_end_odd; + compute_core_start_end_custom(0, op->num_even_sites*l->num_lattice_site_var, &thread_start_even, &thread_end_even, l, threading, l->num_lattice_site_var ); + compute_core_start_end_custom(op->num_even_sites*l->num_lattice_site_var, l->inner_vector_size, &thread_start_odd, &thread_end_odd, l, threading, l->num_lattice_site_var ); vector_PRECISION tmp = op->buffer[0]; // odd to even PROF_PRECISION_START( _SC, threading ); - diag_oo_inv_PRECISION( tmp, p->b, op, l, start_odd, end_odd ); + diag_oo_inv_PRECISION( tmp, p->b, op, l, thread_start_odd, thread_end_odd ); PROF_PRECISION_STOP( _SC, 0, threading ); SYNC_CORES(threading) -// g5_PRECISION( tmp, tmp, start_odd, end_odd, l ); -// vector_PRECISION_scale( tmp, tmp, -1, start_odd, end_odd, l ); - minus_g5_PRECISION( tmp, tmp, start_odd, end_odd, l ); + minus_g5_PRECISION( tmp, tmp, thread_start_odd, thread_end_odd, l ); 
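/* Editor's note -- illustrative sketch only, not part of the library or of this
 * patch: the odd-even solvers here (solve_oddeven_PRECISION and the g5D variant)
 * all rely on the same block elimination. Writing the operator in even/odd
 * ordering as D = [ D_ee D_eo ; D_oe D_oo ], the system D x = b is reduced to
 * the even sites via the Schur complement S = D_ee - D_eo D_oo^{-1} D_oe, and
 * the odd part is reconstructed afterwards. The scalar toy code below (all
 * names invented for the example) shows that identity; in the library the
 * scalars become the per-site blocks handled by diag_oo_inv_PRECISION,
 * hopping_term_PRECISION and the even-site FGMRES solve. */
#include <stdio.h>

static void oddeven_reduction_sketch( double Dee, double Deo, double Doe, double Doo,
                                      double be, double bo, double *xe, double *xo ) {
  double S      = Dee - Deo*(1.0/Doo)*Doe;  /* Schur complement on the even sites      */
  double be_hat = be  - Deo*(1.0/Doo)*bo;   /* odd-to-even reduction of the rhs        */
  *xe = be_hat/S;                           /* even-site solve (FGMRES in the library) */
  *xo = (bo - Doe*(*xe))/Doo;               /* even-to-odd back substitution           */
}

int main( void ) {
  double xe, xo;
  oddeven_reduction_sketch( 4.0, 1.0, 2.0, 3.0, 5.0, 6.0, &xe, &xo );
  /* residuals of the full 2x2 system should vanish */
  printf( "r_e = %e, r_o = %e\n", 4.0*xe + 1.0*xo - 5.0, 2.0*xe + 3.0*xo - 6.0 );
  return 0;
}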
SYNC_CORES(threading) PROF_PRECISION_START( _NC, threading ); - vector_PRECISION_define( p->x, 0, start_even, end_even, l ); + vector_PRECISION_define_zero( p->x, start_even, end_even, l, threading ); hopping_term_PRECISION( p->x, tmp, op, _EVEN_SITES, l, threading ); PROF_PRECISION_STOP( _NC, 0, threading ); SYNC_CORES(threading) - g5_PRECISION( p->x, p->x, start_even, end_even, l ); - vector_PRECISION_plus( p->b, p->b, p->x, start_even, end_even, l ); + g5_PRECISION( p->x, p->x, thread_start_even, thread_end_even, l ); + vector_PRECISION_plus( p->b, p->b, p->x, thread_start_even, thread_end_even, l ); SYNC_CORES(threading) ASSERT( g.method == 6 ); fgmres_PRECISION( p, l, threading ); - diag_oo_inv_PRECISION( p->x, p->b, op, l, start_odd, end_odd ); - g5_PRECISION( p->x, p->x, start_odd, end_odd, l ); + diag_oo_inv_PRECISION( p->x, p->b, op, l, thread_start_odd, thread_end_odd ); + g5_PRECISION( p->x, p->x, thread_start_odd, thread_end_odd, l ); // even to odd - SYNC_CORES(threading) - vector_PRECISION_define( tmp, 0, start_odd, end_odd, l ); + SYNC_CORES(threading); + vector_PRECISION_define_zero( tmp, start_odd, end_odd, l, threading ); SYNC_CORES(threading) PROF_PRECISION_START( _NC, threading ); hopping_term_PRECISION( tmp, p->x, op, _ODD_SITES, l, threading ); PROF_PRECISION_STOP( _NC, 1, threading ); SYNC_CORES(threading) PROF_PRECISION_START( _SC, threading ); - diag_oo_inv_PRECISION( p->b, tmp, op, l, start_odd, end_odd ); + diag_oo_inv_PRECISION( p->b, tmp, op, l, thread_start_odd, thread_end_odd ); PROF_PRECISION_STOP( _SC, 1, threading ); SYNC_CORES(threading) - vector_PRECISION_minus( p->x, p->x, p->b, start_odd, end_odd, l ); + vector_PRECISION_minus( p->x, p->x, p->b, thread_start_odd, thread_end_odd, l ); SYNC_CORES(threading) } // ----- block odd even ----------------------------------------------------------- -#ifndef OPTIMIZED_SELF_COUPLING_PRECISION -void schwarz_PRECISION_oddeven_setup( operator_PRECISION_struct *op, level_struct *l ) { +void schwarz_PRECISION_oddeven_setup( schwarz_PRECISION_struct *s, level_struct *l ) { - config_PRECISION clover_pt = op->clover, oe_clover_pt = op->oe_clover; - complex_double buffer[42]; int mu, i, d0, c0, b0, a0, d1, c1, b1, a1, t, z, y, x, agg_split[4], block_split[4], block_size[4]; + operator_PRECISION_struct *op = &(s->op); + int n1 = s->num_block_even_sites; #ifdef HAVE_TM config_PRECISION tm_term_pt = op->tm_term; #endif + + for ( mu=0; mu<4; mu++ ) { + agg_split[mu] = l->local_lattice[mu]/l->coarsening[mu]; + block_split[mu] = l->coarsening[mu]/l->block_lattice[mu]; + block_size[mu] = l->block_lattice[mu]; + } if ( g.csw ) { - for ( mu=0; mu<4; mu++ ) { - agg_split[mu] = l->local_lattice[mu]/l->coarsening[mu]; - block_split[mu] = l->coarsening[mu]/l->block_lattice[mu]; - block_size[mu] = l->block_lattice[mu]; - } - +#ifndef OPTIMIZED_SELF_COUPLING_PRECISION + config_PRECISION clover_pt = op->clover, clover_oo_inv_pt = op->clover_oo_inv; + complex_double buffer[42]; + int cs = 42; +#else + PRECISION *clover_pt = op->clover_vectorized, *clover_oo_inv_pt = op->clover_oo_inv_vectorized; + int cs = 144; +#endif for ( d0=0; d0oe_clover, op->clover, 0, l->inner_vector_size, l ); + } + +#ifdef HAVE_TM1p1 +#ifndef OPTIMIZED_SELF_COUPLING_PRECISION + complex_double buffer[66]; + config_PRECISION clover_oo_inv_pt = op->clover_doublet_oo_inv, clover_pt = op->clover; + int cs = g.csw ? 42:12; +#else + PRECISION *clover_pt = g.csw ? 
op->clover_doublet_vectorized:(PRECISION*)op->clover, *clover_oo_inv_pt = op->clover_doublet_oo_inv_vectorized; + int cs = g.csw ? 288:24; +#endif + config_PRECISION eps_term_pt = op->epsbar_term; #ifdef HAVE_TM - if (g.tm_mu + g.tm_mu_odd_shift != 0.0 || g.tm_mu + g.tm_mu_even_shift != 0.0 ) - vector_PRECISION_plus( op->oe_clover, op->oe_clover, op->tm_term, 0, l->inner_vector_size, l ); + tm_term_pt = op->tm_term; #endif - } -} + + for ( d0=0; d0num_block_even_sites; - config_PRECISION clover = (g.csw==0.0)?s->op.clover+start:s->op.clover+(start/12)*42; - vector_PRECISION lphi = phi+start, leta = eta+start; + int n1 = s->num_block_even_sites, nv = l->num_lattice_site_var; + clover_PRECISION( eta, phi, &(s->op), start, start+nv*n1, l, threading ); - // diagonal blocks applied to the even sites of a block - clover_PRECISION( leta, lphi, clover, 12*n1, l, no_threading ); -#ifdef HAVE_TM - config_PRECISION tm_term = s->op.tm_term+start; - if (g.tm_mu + g.tm_mu_odd_shift != 0.0 || g.tm_mu + g.tm_mu_even_shift != 0.0 ) - add_diagonal_PRECISION( leta, lphi, tm_term, 12*n1 ); -#endif - END_UNTHREADED_FUNCTION(threading) } -#endif - -#ifndef OPTIMIZED_SELF_COUPLING_PRECISION + +// diagonal blocks applied to the odd sites of a block void block_diag_oo_PRECISION( vector_PRECISION eta, vector_PRECISION phi, int start, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ) { START_UNTHREADED_FUNCTION(threading) - int i, n1 = s->num_block_even_sites, n2 = s->num_block_odd_sites; -#ifndef HAVE_TM - config_PRECISION clover = (g.csw==0.0)?s->op.oe_clover+start:s->op.oe_clover+(start/12)*42; + +#ifdef OPTIMIZED_SELF_COUPLING_PRECISION + //we don't have the LU decomposition here, for debugging only + int n1 = s->num_block_even_sites, n2 = s->num_block_odd_sites, nv = l->num_lattice_site_var; + clover_PRECISION( eta, phi, &(s->op), start+nv*n1, start+nv*(n1+n2), l, threading ); + #else - config_PRECISION clover = (g.csw==0.0)?s->op.oe_clover+start:s->op.oe_clover+(start/12)*72; + + int i, n1 = s->num_block_even_sites, n2 = s->num_block_odd_sites; +#ifdef HAVE_TM1p1 + if ( g.n_flavours == 2 ) { + int block_num = start/24/(n1+n2); + // config_PRECISION clover = s->op.clover_doublet_oo_inv+n1*288+(start/24)*288; + config_PRECISION clover = s->op.clover_doublet_oo_inv+(start/24-block_num*n1)*288; + vector_PRECISION lphi = phi+n1*24+start, leta = eta+n1*24+start; + for ( i=0; iop.clover_oo_inv+(start/12-block_num*n1)*42; + for ( i=0; iop.clover_oo_inv+(start/12-block_num*n1)*72; + for ( i=0; iop.clover+n1*12+start; #ifndef HAVE_TM - LLH_multiply_PRECISION( leta, lphi, clover ); - leta+=12; lphi+=12; clover+=42; + for ( i=0; i<12*n2; i++ ) + leta[i] = lphi[i]*(clover[i]); #else - LU_multiply_PRECISION( leta, lphi, clover ); - leta+=12; lphi+=12; clover+=72; + config_PRECISION tm_term = s->op.tm_term+n1*12+start; + for ( i=0; i<12*n2; i++ ) + leta[i] = lphi[i]*(clover[i]+tm_term[i]); #endif } - } else { - leta += n1*12; lphi += n1*12; clover += n1*12; - for ( i=0; i<12*n2; i++ ) - leta[i] = lphi[i]*clover[i]; +#ifdef HAVE_TM1p1 } +#endif +#endif END_UNTHREADED_FUNCTION(threading) } -#endif -#ifndef OPTIMIZED_SELF_COUPLING_PRECISION -void block_diag_oo_inv_PRECISION( vector_PRECISION eta, vector_PRECISION phi, - int start, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ) { +// inverted diagonal blocks applied to the odd sites of a block +void block_diag_oo_inv_PRECISION( vector_PRECISION eta, vector_PRECISION phi, int start, schwarz_PRECISION_struct *s, + level_struct *l, 
struct Thread *threading ) { START_UNTHREADED_FUNCTION(threading) int i, n1 = s->num_block_even_sites, n2 = s->num_block_odd_sites; -#ifndef HAVE_TM - config_PRECISION clover = (g.csw==0.0)?s->op.oe_clover+start:s->op.oe_clover+(start/12)*42; + +#ifdef HAVE_TM1p1 + if ( g.n_flavours == 2 ) { + + vector_PRECISION lphi = phi+n1*24+start, leta = eta+n1*24+start; + int block_num = start/24/(n1+n2); +#ifndef OPTIMIZED_SELF_COUPLING_PRECISION + config_PRECISION clover = s->op.clover_doublet_oo_inv + (start/24-block_num*n1)*288; + for ( i=0; iop.oe_clover+start:s->op.oe_clover+(start/12)*72; + PRECISION *clover_vectorized = s->op.clover_doublet_oo_inv_vectorized + (start/24-block_num*n1)*2*288; + for ( i=0; iop.clover_oo_inv+(start/12-block_num*n1)*42; + for ( i=0; iop.clover_oo_inv+(start/12-block_num*n1)*72; + for ( i=0; iop.clover_oo_inv_vectorized + (start/12-block_num*n1)*144; + for ( i=0; iop.clover+n1*12+start; #ifndef HAVE_TM - LLH_perform_fwd_bwd_subs_PRECISION( leta, lphi, clover ); - leta+=12; lphi+=12; clover+=42; + for ( i=0; i<12*n2; i++ ) + leta[i] = lphi[i]/(clover[i]); #else - LU_perform_fwd_bwd_subs_PRECISION( leta, lphi, clover ); - leta+=12; lphi+=12; clover+=72; + config_PRECISION tm_term = s->op.tm_term+n1*12+start; + for ( i=0; i<12*n2; i++ ) + leta[i] = lphi[i]/(clover[i]+tm_term[i]); #endif } - } else { - leta += n1*12; lphi += n1*12; clover += n1*12; - for ( i=0; i<12*n2; i++ ) - leta[i] = lphi[i]/clover[i]; +#ifdef HAVE_TM1p1 } - +#endif + END_UNTHREADED_FUNCTION(threading) } -#endif -#ifndef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION void block_hopping_term_PRECISION( vector_PRECISION eta, vector_PRECISION phi, int start, int amount, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ) { START_UNTHREADED_FUNCTION(threading) int a1, a2, n1, n2, *length_even = s->dir_length_even, *length_odd = s->dir_length_odd, - **index = s->oe_index, *ind, *neighbor = s->op.neighbor_table; - config_PRECISION D = s->op.D + (start/12)*36; - int i, j, k; - complex_PRECISION buf1[13] = {0,0,0,0,0,0,0,0,0,0,0,0,0}, *buf2 = buf1+6; + **index = s->oe_index, *neighbor = s->op.neighbor_table, nv = l->num_lattice_site_var; + +#ifdef OPTIMIZED_NEIGHBOR_COUPLING_float + PRECISION *Dplus = s->op.D_vectorized + (start/nv)*96; + PRECISION *Dminus = s->op.D_transformed_vectorized + (start/nv)*96; + + for ( int mu=0; mu<4; mu++ ) { + if ( amount == _EVEN_SITES ) { + a1 = 0; n1 = length_even[mu]; + a2 = n1; n2 = a2 + length_odd[mu]; + } else if ( amount == _ODD_SITES ) { + a1 = length_even[mu]; n1 = a1 + length_odd[mu]; + a2 = 0; n2 = a1; + } else { + a1 = 0; n1 = length_even[mu]+length_odd[mu]; + a2 = 0; n2 = n1; + } + block_oddeven_plus_coupling_PRECISION( (PRECISION*)(eta+start), Dplus, (PRECISION*)(phi+start), mu, a1, n1, index[mu], neighbor ); + block_oddeven_minus_coupling_PRECISION( (PRECISION*)(eta+start), Dminus, (PRECISION*)(phi+start), mu, a2, n2, index[mu], neighbor ); + } + +#else + config_PRECISION D = s->op.D + (start/nv)*36; + int i, j, k, *ind; config_PRECISION D_pt; vector_PRECISION lphi = phi+start, leta = eta+start; + +#ifdef HAVE_TM1p1 + if ( g.n_flavours == 2 ) { + complex_PRECISION buf1[25] = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, *buf2 = buf1+12; + // T direction + if ( amount == _EVEN_SITES ) { + a1 = 0; n1 = length_even[T]; + a2 = n1; n2 = a2 + length_odd[T]; + } else if ( amount == _ODD_SITES ) { + a1 = length_even[T]; n1 = a1 + length_odd[T]; + a2 = 0; n2 = a1; + } else { + a1 = 0; n1 = length_even[T]+length_odd[T]; + a2 = 0; n2 = n1; + } + // 
"amount" of a block, +T coupling + ind = index[T]; + for ( i=a1; idir_length_even, *length_odd = s->dir_length_odd, - **index = s->oe_index, *ind, *neighbor = s->op.neighbor_table; - complex_PRECISION buf1[12], *buf2 = buf1+6; - vector_PRECISION lphi = phi+start, leta = eta+start; - config_PRECISION D_pt, D = s->op.D + (start/12)*36; - - // T direction - if ( amount == _EVEN_SITES ) { - a1 = 0; n1 = length_even[T]; - a2 = n1; n2 = a2 + length_odd[T]; - } else if ( amount == _ODD_SITES ) { - a1 = length_even[T]; n1 = a1 + length_odd[T]; - a2 = 0; n2 = a1; - } else { - a1 = 0; n1 = length_even[T]+length_odd[T]; - a2 = 0; n2 = n1; - } - // "amount" of a block, +T coupling - ind = index[T]; - for ( i=a1; idir_length_even, *length_odd = s->dir_length_odd, + **index = s->oe_index, *neighbor = s->op.neighbor_table, nv = l->num_lattice_site_var; + +#ifdef OPTIMIZED_NEIGHBOR_COUPLING_float + PRECISION *Dplus = s->op.D_vectorized + (start/nv)*96; + PRECISION *Dminus = s->op.D_transformed_vectorized + (start/nv)*96; + + for ( int mu=0; mu<4; mu++ ) { + if ( amount == _EVEN_SITES ) { + a1 = 0; n1 = length_even[mu]; + a2 = n1; n2 = a2 + length_odd[mu]; + } else if ( amount == _ODD_SITES ) { + a1 = length_even[mu]; n1 = a1 + length_odd[mu]; + a2 = 0; n2 = a1; + } else { + a1 = 0; n1 = length_even[mu]+length_odd[mu]; + a2 = 0; n2 = n1; + } + block_oddeven_nplus_coupling_PRECISION( (PRECISION*)(eta+start), Dplus, (PRECISION*)(phi+start), mu, a1, n1, index[mu], neighbor ); + block_oddeven_nminus_coupling_PRECISION( (PRECISION*)(eta+start), Dminus, (PRECISION*)(phi+start), mu, a2, n2, index[mu], neighbor ); } - - // X direction - if ( amount == _EVEN_SITES ) { - a1 = 0; n1 = length_even[X]; - a2 = n1; n2 = a2 + length_odd[X]; - } else if ( amount == _ODD_SITES ) { - a1 = length_even[X]; n1 = a1 + length_odd[X]; - a2 = 0; n2 = a1; + +#else + int i, j, k, *ind; + vector_PRECISION lphi = phi+start, leta = eta+start; + config_PRECISION D_pt, D = s->op.D + (start/nv)*36; + +#ifdef HAVE_TM1p1 + if ( g.n_flavours == 2 ) { + complex_PRECISION buf1[24], *buf2 = buf1+12; + // T direction + if ( amount == _EVEN_SITES ) { + a1 = 0; n1 = length_even[T]; + a2 = n1; n2 = a2 + length_odd[T]; + } else if ( amount == _ODD_SITES ) { + a1 = length_even[T]; n1 = a1 + length_odd[T]; + a2 = 0; n2 = a1; + } else { + a1 = 0; n1 = length_even[T]+length_odd[T]; + a2 = 0; n2 = n1; + } + // "amount" of a block, +T coupling + ind = index[T]; + for ( i=a1; ioe_buf; block_diag_ee_PRECISION( out, in, start, s, l, threading ); - START_LOCKED_MASTER(threading) - vector_PRECISION_define( tmp[0], 0, start + 12*s->num_block_even_sites, start + s->block_vector_size, l ); - END_LOCKED_MASTER(threading) + vector_PRECISION_define_zero( tmp[0], start + l->num_lattice_site_var*s->num_block_even_sites, start + s->block_vector_size, l, threading ); block_hopping_term_PRECISION( tmp[0], in, start, _ODD_SITES, s, l, threading ); block_diag_oo_inv_PRECISION( tmp[1], tmp[0], start, s, l, threading ); block_n_hopping_term_PRECISION( out, tmp[1], start, _EVEN_SITES, s, l, threading ); @@ -1578,12 +2503,11 @@ void block_solve_oddeven_PRECISION( vector_PRECISION phi, vector_PRECISION r, ve // odd to even vector_PRECISION_copy( tmp[3], r, start, end, l ); - block_diag_oo_inv_PRECISION( tmp[2], tmp[3], start, s, l, no_threading ); block_n_hopping_term_PRECISION( tmp[3], tmp[2], start, _EVEN_SITES, s, l, no_threading ); local_minres_PRECISION( NULL, tmp[3], tmp[2], start, s, l, no_threading ); - + // even to odd block_n_hopping_term_PRECISION( tmp[3], tmp[2], 
start, _ODD_SITES, s, l, no_threading ); block_diag_oo_inv_PRECISION( tmp[2], tmp[3], start, s, l, no_threading ); @@ -1592,45 +2516,50 @@ void block_solve_oddeven_PRECISION( vector_PRECISION phi, vector_PRECISION r, ve vector_PRECISION_copy( latest_iter, tmp[2], start, end, l ); vector_PRECISION_plus( phi, phi, tmp[2], start, end, l ); // update r - vector_PRECISION_copy( r, tmp[3], start, start+12*s->num_block_even_sites, l ); - vector_PRECISION_define( r, 0, start+12*s->num_block_even_sites, end, l ); + vector_PRECISION_copy( r, tmp[3], start, start+l->num_lattice_site_var*s->num_block_even_sites, l ); + vector_PRECISION_define_zero( r, start+l->num_lattice_site_var*s->num_block_even_sites, end, l, no_threading ); END_UNTHREADED_FUNCTION(threading) } void block_oddeven_PRECISION_test( level_struct *l, struct Thread *threading ) { -#if !defined( OPTIMIZED_NEIGHBOR_COUPLING_PRECISION ) && !defined( OPTIMIZED_SELF_COUPLING_PRECISION ) START_UNTHREADED_FUNCTION(threading) schwarz_PRECISION_struct *s = &(l->s_PRECISION); vector_PRECISION b1 = NULL, b2 = NULL, b3 = NULL, b4 = NULL, b5 = NULL; - MALLOC( b1, complex_PRECISION, s->block_vector_size ); - MALLOC( b2, complex_PRECISION, s->block_vector_size ); - MALLOC( b3, complex_PRECISION, s->block_vector_size ); - MALLOC( b4, complex_PRECISION, s->block_vector_size ); - MALLOC( b5, complex_PRECISION, s->block_vector_size ); - - vector_PRECISION_define_random( b1, 0, s->block_vector_size, l ); - - block_diag_ee_PRECISION( b2, b1, 0, s, l, no_threading ); - block_diag_oo_PRECISION( b2, b1, 0, s, l, no_threading ); - block_hopping_term_PRECISION( b2, b1, 0, _FULL_SYSTEM, s, l, no_threading ); + PRECISION diff; + + int vs = s->block_vector_size * s->num_blocks; + + MALLOC( b1, complex_PRECISION, vs ); + MALLOC( b2, complex_PRECISION, vs ); + MALLOC( b3, complex_PRECISION, vs ); + MALLOC( b4, complex_PRECISION, vs ); + MALLOC( b5, complex_PRECISION, vs ); - block_d_plus_clover_PRECISION( b3, b1, 0, s, l, no_threading ); + vector_PRECISION_define_random( b1, 0, vs, l, no_threading ); + + for (int i = 0; i< s->num_blocks; i++ ) { + block_diag_ee_PRECISION( b2, b1, s->block[i].start*l->num_lattice_site_var, s, l, no_threading ); + block_diag_oo_PRECISION( b2, b1, s->block[i].start*l->num_lattice_site_var, s, l, no_threading ); + block_hopping_term_PRECISION( b2, b1, s->block[i].start*l->num_lattice_site_var, _FULL_SYSTEM, s, l, no_threading ); + + block_d_plus_clover_PRECISION( b3, b1, s->block[i].start*l->num_lattice_site_var, s, l, no_threading ); + } - vector_PRECISION_minus( b3, b3, b2, 0, s->block_vector_size, l ); - double diff = process_norm_PRECISION( b3, 0, s->block_vector_size, l, no_threading ) / process_norm_PRECISION( b2, 0, s->block_vector_size, l, no_threading ); + vector_PRECISION_minus( b3, b3, b2, 0, vs, l ); + diff = global_norm_PRECISION( b3, 0, vs, l, no_threading ) / global_norm_PRECISION( b2, 0, vs, l, no_threading ); - printf0("depth: %d, correctness of block odd even layout: %le\n", l->depth, diff ); + test0_PRECISION("depth: %d, correctness of block odd even layout: %le\n", l->depth, diff ); vector_PRECISION_copy( b4, b1, 0, s->block_vector_size, l ); - vector_PRECISION_define( b3, 0, 12*s->num_block_even_sites, s->block_vector_size, l ); + vector_PRECISION_define_zero( b3, l->num_lattice_site_var*s->num_block_even_sites, s->block_vector_size, l, no_threading ); block_hopping_term_PRECISION( b3, b4, 0, _ODD_SITES, s, l, no_threading ); block_diag_oo_inv_PRECISION( b5, b3, 0, s, l, no_threading ); - vector_PRECISION_plus( b4, 
b4, b5, 12*s->num_block_even_sites, s->block_vector_size, l ); + vector_PRECISION_plus( b4, b4, b5, l->num_lattice_site_var*s->num_block_even_sites, s->block_vector_size, l ); apply_block_schur_complement_PRECISION( b3, b4, 0, s, l, no_threading ); block_diag_oo_PRECISION( b3, b4, 0, s, l, no_threading ); @@ -1639,21 +2568,19 @@ void block_oddeven_PRECISION_test( level_struct *l, struct Thread *threading ) { block_hopping_term_PRECISION( b3, b5, 0, _EVEN_SITES, s, l, no_threading ); vector_PRECISION_minus( b3, b2, b3, 0, s->block_vector_size, l ); - diff = process_norm_PRECISION( b3, 0, s->block_vector_size, l, no_threading ) / process_norm_PRECISION( b2, 0, s->block_vector_size, l, no_threading ); + diff = global_norm_PRECISION( b3, 0, s->block_vector_size, l, no_threading ) / global_norm_PRECISION( b2, 0, s->block_vector_size, l, no_threading ); - printf0("depth: %d, correctness of block odd even schur complement: %le\n", l->depth, diff ); + test0_PRECISION("depth: %d, correctness of block odd even schur complement: %le\n", l->depth, diff ); - FREE( b1, complex_PRECISION, s->block_vector_size ); - FREE( b2, complex_PRECISION, s->block_vector_size ); - FREE( b3, complex_PRECISION, s->block_vector_size ); - FREE( b4, complex_PRECISION, s->block_vector_size ); - FREE( b5, complex_PRECISION, s->block_vector_size ); + FREE( b1, complex_PRECISION, vs ); + FREE( b2, complex_PRECISION, vs ); + FREE( b3, complex_PRECISION, vs ); + FREE( b4, complex_PRECISION, vs ); + FREE( b5, complex_PRECISION, vs ); END_UNTHREADED_FUNCTION(threading) -#endif } - void oddeven_PRECISION_test( level_struct *l ) { /********************************************************************************* @@ -1666,7 +2593,7 @@ void oddeven_PRECISION_test( level_struct *l ) { vector_double d1=NULL, d2=NULL, d3=NULL; vector_PRECISION f1=NULL, f2=NULL, f3=NULL, f4=NULL, f5=NULL; - double norm; + double diff; MALLOC( d1, complex_double, l->inner_vector_size ); MALLOC( d2, complex_double, l->inner_vector_size ); @@ -1677,7 +2604,7 @@ void oddeven_PRECISION_test( level_struct *l ) { MALLOC( f4, complex_PRECISION, l->inner_vector_size ); MALLOC( f5, complex_PRECISION, l->inner_vector_size ); - vector_double_define_random( d1, 0, l->inner_vector_size, l ); + vector_double_define_random( d1, 0, l->inner_vector_size, l, no_threading ); serial_to_oddeven_PRECISION( f1, d1, l, no_threading ); diag_ee_PRECISION( f2, f1, &(l->oe_op_PRECISION), l, 0, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var ); @@ -1689,10 +2616,9 @@ void oddeven_PRECISION_test( level_struct *l ) { oddeven_to_serial_PRECISION( d1, f2, l, no_threading ); vector_double_minus( d3, d1, d2, 0, l->num_inner_lattice_sites, l ); - norm = global_norm_double( d3, 0, l->num_inner_lattice_sites, l, no_threading )/global_norm_double( d1, 0, l->num_inner_lattice_sites, l, no_threading ); + diff = global_norm_double( d3, 0, l->num_inner_lattice_sites, l, no_threading )/global_norm_double( d1, 0, l->num_inner_lattice_sites, l, no_threading ); - printf0("depth: %d, correctness of odd even layout: %le\n", l->depth, norm ); - if(norm > g.test) g.test = norm; + test0_PRECISION("depth: %d, correctness of odd even layout: %le\n", l->depth, diff ); // -------------- @@ -1701,16 +2627,15 @@ void oddeven_PRECISION_test( level_struct *l ) { diag_oo_inv_PRECISION( f4, f3, &(l->oe_op_PRECISION), l, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l->inner_vector_size ); vector_PRECISION_minus( f4, f4, f1, 0, l->inner_vector_size, l ); - norm = (PRECISION) 
(global_norm_PRECISION( f4, 0, l->inner_vector_size, l, no_threading )/global_norm_PRECISION( f1, 0, l->inner_vector_size, l, no_threading )); + diff = (PRECISION) (global_norm_PRECISION( f4, 0, l->inner_vector_size, l, no_threading )/global_norm_PRECISION( f1, 0, l->inner_vector_size, l, no_threading )); - printf0("depth: %d, correctness of odd even diagonal term: %le\n", l->depth, norm ); - if(norm > g.test) g.test = norm; + test0_PRECISION("depth: %d, correctness of odd even diagonal term: %le\n", l->depth, diff ); // transformation part vector_PRECISION_copy( f4, f1, 0, l->inner_vector_size, l ); // even to odd // set odd part of f3 to 0. - vector_PRECISION_define( f3, 0, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l->inner_vector_size, l ); + vector_PRECISION_define_zero( f3, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l->inner_vector_size, l, no_threading ); hopping_term_PRECISION( f3, f4, &(l->oe_op_PRECISION), _ODD_SITES, l, no_threading ); diag_oo_inv_PRECISION( f5, f3, &(l->oe_op_PRECISION), l, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l->inner_vector_size ); @@ -1724,10 +2649,9 @@ void oddeven_PRECISION_test( level_struct *l ) { hopping_term_PRECISION( f3, f5, &(l->oe_op_PRECISION), _EVEN_SITES, l, no_threading ); vector_PRECISION_minus( f1, f2, f3, 0, l->inner_vector_size, l ); - norm = (PRECISION) (global_norm_PRECISION( f1, 0, l->inner_vector_size, l, no_threading )/global_norm_PRECISION( f2, 0, l->inner_vector_size, l, no_threading )); + diff = (PRECISION) (global_norm_PRECISION( f1, 0, l->inner_vector_size, l, no_threading )/global_norm_PRECISION( f2, 0, l->inner_vector_size, l, no_threading )); - printf0("depth: %d, correctness of odd even schur complement: %le\n", l->depth, norm ); - if(norm > g.test) g.test = norm; + test0_PRECISION("depth: %d, correctness of odd even schur complement: %le\n", l->depth, diff ); FREE( d1, complex_double, l->inner_vector_size ); FREE( d2, complex_double, l->inner_vector_size ); diff --git a/src/oddeven_generic.h b/src/oddeven_generic.h index 66cce02..4fac101 100644 --- a/src/oddeven_generic.h +++ b/src/oddeven_generic.h @@ -52,7 +52,7 @@ struct Thread; void g5D_apply_schur_complement_PRECISION( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); void g5D_solve_oddeven_PRECISION( gmres_PRECISION_struct *p, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); - void schwarz_PRECISION_oddeven_setup( operator_PRECISION_struct *op, level_struct *l ); + void schwarz_PRECISION_oddeven_setup( schwarz_PRECISION_struct *s, level_struct *l ); void apply_block_schur_complement_PRECISION( vector_PRECISION out, vector_PRECISION in, int start, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ); diff --git a/src/operator_generic.c b/src/operator_generic.c index 41a3895..2c783ea 100644 --- a/src/operator_generic.c +++ b/src/operator_generic.c @@ -29,13 +29,29 @@ void operator_PRECISION_init( operator_PRECISION_struct *op ) { op->backward_neighbor_table = NULL; op->translation_table = NULL; op->D = NULL; + op->D_vectorized = NULL; + op->D_transformed_vectorized = NULL; op->clover = NULL; - op->oe_clover = NULL; - op->oe_clover_vectorized = NULL; + op->clover_oo_inv = NULL; + op->clover_vectorized = NULL; + op->clover_oo_inv_vectorized = NULL; + op->m0 = 0; #ifdef HAVE_TM + op->mu = 0; + op->mu_even_shift = 0; + op->mu_odd_shift = 0; op->odd_proj = NULL; op->tm_term = NULL; #endif +#ifdef HAVE_TM1p1 + 
op->epsbar = 0; + op->epsbar_ig5_even_shift = 0; + op->epsbar_ig5_odd_shift = 0; + op->epsbar_term = NULL; + op->clover_doublet_oo_inv = NULL; + op->clover_doublet_vectorized = NULL; + op->clover_doublet_oo_inv_vectorized = NULL; +#endif for ( int mu=0; mu<4; mu++ ) op->config_boundary_table[mu] = NULL; @@ -47,11 +63,6 @@ void operator_PRECISION_init( operator_PRECISION_struct *op ) { } op->c.comm = 1; op->buffer = NULL; -#ifdef VECTORIZE_COARSE_OPERATOR_PRECISION - op->D_vectorized = NULL; - op->D_transformed_vectorized = NULL; - op->clover_vectorized = NULL; -#endif } @@ -61,6 +72,9 @@ void operator_PRECISION_alloc_projection_buffers( operator_PRECISION_struct *op, // g.method >= 4: then oddeven_setup_float() is called in init.c, method_setup(). if ( l->depth == 0 ) { int its = (l->num_lattice_site_var/2)*l->num_lattice_sites; +#ifdef HAVE_TM1p1 + its *= 2; +#endif MALLOC( op->prnT, complex_PRECISION, its*8 ); op->prnZ = op->prnT + its; op->prnY = op->prnZ + its; op->prnX = op->prnY + its; op->prpT = op->prnX + its; op->prpZ = op->prpT + its; op->prpY = op->prpZ + its; op->prpX = op->prpY + its; @@ -70,6 +84,9 @@ void operator_PRECISION_free_projection_buffers( operator_PRECISION_struct *op, if ( l->depth == 0 ) { int its = (l->num_lattice_site_var/2)*l->num_lattice_sites; +#ifdef HAVE_TM1p1 + its *= 2; +#endif FREE( op->prnT, complex_PRECISION, its*8 ); } } @@ -102,35 +119,56 @@ void operator_PRECISION_alloc( operator_PRECISION_struct *op, const int type, le its *= (l->local_lattice[mu]+its_boundary); } - nls = (type==_ORDINARY)?l->num_inner_lattice_sites:2*l->num_lattice_sites-l->num_inner_lattice_sites; + nls = (type==_SCHWARZ) ? (2*l->num_lattice_sites-l->num_inner_lattice_sites):l->num_inner_lattice_sites; + MALLOC( op->D, complex_PRECISION, coupling_site_size*nls ); MALLOC( op->clover, complex_PRECISION, clover_site_size*l->num_inner_lattice_sites ); + + int block_site_size = ( l->depth == 0 ) ? 
12 : (l->num_lattice_site_var/2*(l->num_lattice_site_var/2+1)); + MALLOC( op->odd_proj, complex_PRECISION, block_site_size*l->num_inner_lattice_sites ); #ifdef HAVE_TM - int tm_site_size; - if ( l->depth == 0 ) - tm_site_size = 12; - else - tm_site_size = (l->num_lattice_site_var/2*(l->num_lattice_site_var/2+1)); - - MALLOC( op->tm_term, complex_PRECISION, tm_site_size*l->num_inner_lattice_sites ); - MALLOC( op->odd_proj, complex_PRECISION, tm_site_size*l->num_inner_lattice_sites ); - if ( type == _SCHWARZ && l->depth == 0 && g.odd_even ) //we use LU here - MALLOC( op->oe_clover, complex_PRECISION, 72*l->num_inner_lattice_sites ); -#else - if ( type == _SCHWARZ && l->depth == 0 && g.odd_even ) - MALLOC( op->oe_clover, complex_PRECISION, clover_site_size*l->num_inner_lattice_sites ); + MALLOC( op->tm_term, complex_PRECISION, block_site_size*l->num_inner_lattice_sites ); #endif +#ifdef HAVE_TM1p1 + MALLOC( op->epsbar_term, complex_PRECISION, block_site_size*l->num_inner_lattice_sites ); +#endif + MALLOC( op->index_table, int, its ); - MALLOC( op->neighbor_table, int, (l->depth==0?4:5)*l->num_inner_lattice_sites ); - MALLOC( op->backward_neighbor_table, int, (l->depth==0?4:5)*l->num_inner_lattice_sites ); - MALLOC( op->translation_table, int, l->num_inner_lattice_sites ); -#ifdef SSE - if ( l->depth == 0 ) { - MALLOC( op->oe_clover_vectorized, PRECISION, 144*l->num_inner_lattice_sites ); + if ( type ==_ODDEVEN ) { + MALLOC( op->neighbor_table, int, 5*its ); + MALLOC( op->backward_neighbor_table, int, 5*its ); + } else { + MALLOC( op->neighbor_table, int, (l->depth==0?4:5)*l->num_inner_lattice_sites ); + MALLOC( op->backward_neighbor_table, int, (l->depth==0?4:5)*l->num_inner_lattice_sites ); } + MALLOC( op->translation_table, int, l->num_inner_lattice_sites ); + + if ( type == _SCHWARZ && l->depth == 0 && g.odd_even ) { +#ifndef OPTIMIZED_SELF_COUPLING_PRECISION + + if( g.csw ) { +#ifdef HAVE_TM //we use LU here + MALLOC( op->clover_oo_inv, complex_PRECISION, 72*(l->num_inner_lattice_sites/2+1) ); +#else + MALLOC( op->clover_oo_inv, complex_PRECISION, clover_site_size*(l->num_inner_lattice_sites/2+1) ); #endif - - operator_PRECISION_alloc_projection_buffers( op, l ); + } +#ifdef HAVE_TM1p1 + MALLOC( op->clover_doublet_oo_inv, complex_PRECISION, 12*12*2*(l->num_inner_lattice_sites/2+1) ); +#endif + +#else + if( g.csw ) + MALLOC_HUGEPAGES( op->clover_oo_inv_vectorized, PRECISION, 144*(l->num_inner_lattice_sites/2+1), 4*SIMD_LENGTH_PRECISION ); +#ifdef HAVE_TM1p1 + MALLOC_HUGEPAGES( op->clover_doublet_oo_inv_vectorized, PRECISION, 2*2*144*(l->num_inner_lattice_sites/2+1), 4*SIMD_LENGTH_PRECISION ); +#endif + +#endif + } + + if ( type != _ODDEVEN ) + operator_PRECISION_alloc_projection_buffers( op, l ); ghost_alloc_PRECISION( 0, &(op->c), l ); @@ -176,35 +214,54 @@ void operator_PRECISION_free( operator_PRECISION_struct *op, const int type, lev its *= (l->local_lattice[mu]+its_boundary); } - int nls = (type==_ORDINARY)?l->num_inner_lattice_sites:2*l->num_lattice_sites-l->num_inner_lattice_sites; + int nls = (type==_SCHWARZ) ? (2*l->num_lattice_sites-l->num_inner_lattice_sites) : l->num_inner_lattice_sites; FREE( op->D, complex_PRECISION, coupling_site_size*nls ); FREE( op->clover, complex_PRECISION, clover_site_size*l->num_inner_lattice_sites ); + + int block_site_size = ( l->depth == 0 ) ? 
12 : (l->num_lattice_site_var/2*(l->num_lattice_site_var/2+1)); + FREE( op->odd_proj, complex_PRECISION, block_site_size*l->num_inner_lattice_sites ); #ifdef HAVE_TM - int tm_site_size; - if ( l->depth == 0 ) - tm_site_size = 12; - else - tm_site_size = (l->num_lattice_site_var/2*(l->num_lattice_site_var/2+1)); - - FREE( op->tm_term, complex_PRECISION, tm_site_size*l->num_inner_lattice_sites ); - FREE( op->odd_proj, complex_PRECISION, tm_site_size*l->num_inner_lattice_sites ); - if ( type == _SCHWARZ && l->depth == 0 && g.odd_even ) //we use LU here - FREE( op->oe_clover, complex_PRECISION, 72*l->num_inner_lattice_sites ); + FREE( op->tm_term, complex_PRECISION, block_site_size*l->num_inner_lattice_sites ); +#endif + if ( type == _SCHWARZ && l->depth == 0 && g.odd_even ) { +#ifndef OPTIMIZED_SELF_COUPLING_PRECISION + + if( g.csw ) { +#ifdef HAVE_TM //we use LU here + FREE( op->clover_oo_inv, complex_PRECISION, 72*(l->num_inner_lattice_sites/2+1) ); #else - if ( type == _SCHWARZ && l->depth == 0 && g.odd_even ) - FREE( op->oe_clover, complex_PRECISION, clover_site_size*l->num_inner_lattice_sites ); + FREE( op->clover_oo_inv, complex_PRECISION, clover_site_size*(l->num_inner_lattice_sites/2+1) ); +#endif + } +#ifdef HAVE_TM1p1 + FREE( op->clover_doublet_oo_inv, complex_PRECISION, 12*12*2*(l->num_inner_lattice_sites/2+1) ); +#endif + +#else + if( g.csw ) + FREE_HUGEPAGES( op->clover_oo_inv_vectorized, PRECISION, 144*(l->num_inner_lattice_sites/2+1) ); +#ifdef HAVE_TM1p1 + FREE_HUGEPAGES( op->clover_doublet_oo_inv_vectorized, PRECISION, 2*2*144*(l->num_inner_lattice_sites/2+1) ); +#endif + +#endif + } + +#ifdef HAVE_TM1p1 + FREE( op->epsbar_term, complex_PRECISION, block_site_size*l->num_inner_lattice_sites ); #endif FREE( op->index_table, int, its ); - FREE( op->neighbor_table, int, (l->depth==0?4:5)*l->num_inner_lattice_sites ); - FREE( op->backward_neighbor_table, int, (l->depth==0?4:5)*l->num_inner_lattice_sites ); - FREE( op->translation_table, int, l->num_inner_lattice_sites ); -#ifdef SSE - if ( l->depth == 0 ) { - FREE( op->oe_clover_vectorized, PRECISION, 144*l->num_inner_lattice_sites ); + if ( type ==_ODDEVEN ) { + FREE( op->neighbor_table, int, 5*its ); + FREE( op->backward_neighbor_table, int, 5*its ); + } else { + FREE( op->neighbor_table, int, (l->depth==0?4:5)*l->num_inner_lattice_sites ); + FREE( op->backward_neighbor_table, int, (l->depth==0?4:5)*l->num_inner_lattice_sites ); } -#endif + FREE( op->translation_table, int, l->num_inner_lattice_sites ); - operator_PRECISION_free_projection_buffers( op, l ); + if ( type != _ODDEVEN ) + operator_PRECISION_free_projection_buffers( op, l ); ghost_free_PRECISION( &(op->c), l ); @@ -229,8 +286,8 @@ void operator_PRECISION_free( operator_PRECISION_struct *op, const int type, lev void operator_PRECISION_define( operator_PRECISION_struct *op, level_struct *l ) { - int i, mu, t, z, y, x, *it = op->index_table, - ls[4], le[4], l_st[4], l_en[4], *dt = op->table_dim; + int i, mu, t, z, y, x, *it = op->index_table, + ls[4], le[4], l_st[4], l_en[4], *dt = op->table_dim; for ( mu=0; mu<4; mu++ ) { dt[mu] = l->local_lattice[mu]+1; @@ -273,6 +330,36 @@ void operator_PRECISION_define( operator_PRECISION_struct *op, level_struct *l ) define_nt_bt_tt( op->neighbor_table, op->backward_neighbor_table, op->c.boundary_table, op->translation_table, it, dt, l ); } +void operator_PRECISION_set_couplings( operator_PRECISION_struct *op, level_struct *l ) { + + operator_PRECISION_set_self_couplings( op, l ); + operator_PRECISION_set_neighbor_couplings( op, l 
); + +} + +void operator_PRECISION_set_neighbor_couplings( operator_PRECISION_struct *op, level_struct *l ) { + +#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION + int i, n = 2*l->num_lattice_sites - l->num_inner_lattice_sites; + + for ( i=0; i<n; i++ ) { + PRECISION *D_vectorized = op->D_vectorized + 96*i; + PRECISION *D_transformed_vectorized = op->D_transformed_vectorized + 96*i; + complex_PRECISION *D_pt = op->D + 36*i; + for ( int mu=0; mu<4; mu++ ) + set_PRECISION_D_vectorized( D_vectorized+24*mu, D_transformed_vectorized+24*mu, D_pt+9*mu ); + } +#endif + +} + +void operator_PRECISION_set_self_couplings( operator_PRECISION_struct *op, level_struct *l ) { + +#ifdef OPTIMIZED_SELF_COUPLING_PRECISION + if ( g.csw != 0 ) + set_clover_vectorized_PRECISION( op, l, no_threading ); +#endif +} void operator_PRECISION_test_routine( operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { @@ -294,11 +381,11 @@ void operator_PRECISION_test_routine( operator_PRECISION_struc PUBLIC_MALLOC( vd1, complex_double, 4*ivs ); PUBLIC_MALLOC( vp1, complex_PRECISION, 2*ivs ); - vd2 = vd1+ivs; vd3 = vd2+ivs; vd4 = vd3 + ivs; vp2 = vp1 + ivs; + vd2 = vd1 + ivs; vd3 = vd2 + ivs; vd4 = vd3 + ivs; vp2 = vp1 + ivs; START_LOCKED_MASTER(threading) - vector_double_define_random( vd1, 0, l->inner_vector_size, l ); + vector_double_define_random( vd1, 0, l->inner_vector_size, l, no_threading ); apply_operator_double( vd2, vd1, &(g.p), l, no_threading ); trans_PRECISION( vp1, vd1, op->translation_table, l, no_threading ); @@ -306,9 +393,10 @@ void operator_PRECISION_test_routine( operator_PRECISION_struc trans_back_PRECISION( vd3, vp2, op->translation_table, l, no_threading ); vector_double_minus( vd4, vd3, vd2, 0, l->inner_vector_size, l ); - diff = global_norm_double( vd4, 0, ivs, l, no_threading )/global_norm_double( vd3, 0, ivs, l, no_threading ); - printf0("depth: 0, correctness of schwarz PRECISION Dirac operator: %le\n", diff ); - if(diff > g.test) g.test = diff; + diff = global_norm_double( vd4, 0, ivs, l, no_threading )/ + global_norm_double( vd3, 0, ivs, l, no_threading ); + + test0_PRECISION("depth: %d, correctness of schwarz PRECISION Dirac operator: %le\n", l->depth, diff ); END_LOCKED_MASTER(threading) if(threading->n_core > 1) { @@ -319,15 +407,20 @@ void operator_PRECISION_test_routine( operator_PRECISION_struc START_LOCKED_MASTER(threading) trans_back_PRECISION( vd3, vp2, op->translation_table, l, no_threading ); - vector_double_minus( vd4, vd3, vd2, 0, l->inner_vector_size, l ); - diff = global_norm_double( vd4, 0, ivs, l, no_threading )/global_norm_double( vd3, 0, ivs, l, no_threading ); + diff = global_norm_double( vd4, 0, ivs, l, no_threading ) / + global_norm_double( vd3, 0, ivs, l, no_threading ); - printf0("depth: 0, correctness of schwarz PRECISION Dirac operator with threading: %le\n", diff ); + if ( diff > EPS_PRECISION ) + printf0("\x1b[31m"); + printf0("depth: %d, correctness of schwarz PRECISION Dirac operator with threading: %le\n", l->depth, diff ); + if ( diff > EPS_PRECISION ) + printf0("\x1b[0m"); if(diff > g.test) g.test = diff; + END_LOCKED_MASTER(threading) } - + PUBLIC_FREE( vd1, complex_double, 4*ivs ); PUBLIC_FREE( vp1, complex_PRECISION, 2*ivs ); diff --git a/src/operator_generic.h b/src/operator_generic.h index 966e56f..753cf04 100644 --- a/src/operator_generic.h +++ b/src/operator_generic.h @@ -28,6 +28,10 @@ void operator_PRECISION_alloc( operator_PRECISION_struct *op, const int type, level_struct *l ); void operator_PRECISION_define(
operator_PRECISION_struct *op, level_struct *l ); void operator_PRECISION_free( operator_PRECISION_struct *op, const int type, level_struct *l ); + + void operator_PRECISION_set_couplings( operator_PRECISION_struct *op, level_struct *l ); + void operator_PRECISION_set_self_couplings( operator_PRECISION_struct *op, level_struct *l ); + void operator_PRECISION_set_neighbor_couplings( operator_PRECISION_struct *op, level_struct *l ); void operator_PRECISION_test_routine( operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); diff --git a/src/schwarz_generic.c b/src/schwarz_generic.c index a9493ee..28be21b 100644 --- a/src/schwarz_generic.c +++ b/src/schwarz_generic.c @@ -27,6 +27,8 @@ void smoother_PRECISION_def( level_struct *l ) { schwarz_PRECISION_def( &(l->s_PRECISION), &(g.op_double), l ); l->p_PRECISION.op = &(l->s_PRECISION.op); + l->p_PRECISION.v_start = 0; + l->p_PRECISION.v_end = l->inner_vector_size; if ( g.method == 6 ) { l->p_PRECISION.eval_operator = (l->depth > 0)?g5D_apply_coarse_operator_PRECISION:g5D_plus_clover_PRECISION; } else { @@ -49,15 +51,12 @@ void schwarz_PRECISION_init( schwarz_PRECISION_struct *s, level_struct *l ) { s->index[T] = NULL; s->oe_index[T] = NULL; s->block = NULL; - s->bbuf1 = NULL; s->buf1 = NULL; s->buf2 = NULL; s->buf3 = NULL; s->buf4 = NULL; s->buf5 = NULL; l->sbuf_PRECISION[0] = NULL; - s->oe_bbuf[0] = NULL; - s->oe_bbuf[1] = NULL; s->oe_buf[0] = NULL; s->oe_buf[1] = NULL; s->oe_buf[2] = NULL; @@ -73,7 +72,7 @@ void schwarz_PRECISION_init( schwarz_PRECISION_struct *s, level_struct *l ) { void schwarz_PRECISION_alloc( schwarz_PRECISION_struct *s, level_struct *l ) { - int i, j, n, mu, nu, *bl = l->block_lattice, vs = (l->depth==0)?l->inner_vector_size:l->vector_size; + int i, j, n, mu, nu, *bl = l->block_lattice; if ( g.method == 4 ) { fgmres_PRECISION_struct_alloc( l->block_iter, 1, (l->depth==0)?l->inner_vector_size:l->vector_size, @@ -140,22 +139,19 @@ void schwarz_PRECISION_alloc( schwarz_PRECISION_struct *s, level_struct *l ) { } MALLOC( s->block, block_struct, s->num_blocks ); - MALLOC( s->bbuf1, complex_PRECISION, (l->depth==0&&g.odd_even?9:3)*s->block_vector_size ); + + int svs = l->schwarz_vector_size, vs = (l->depth==0)?l->inner_vector_size:l->vector_size; + +#ifdef HAVE_TM1p1 + svs *= 2; + vs *= 2; +#endif + if ( l->depth == 0 ) { - MALLOC( s->oe_buf[0], complex_PRECISION, 4*l->inner_vector_size ); - s->oe_buf[1] = s->oe_buf[0] + l->inner_vector_size; - s->oe_buf[2] = s->oe_buf[1] + l->inner_vector_size; - s->oe_buf[3] = s->oe_buf[2] + l->inner_vector_size; - } - s->bbuf2 = s->bbuf1 + s->block_vector_size; - s->bbuf3 = s->bbuf2 + s->block_vector_size; - if ( l->depth == 0 && g.odd_even ) { - s->oe_bbuf[0] = s->bbuf3 + s->block_vector_size; - s->oe_bbuf[1] = s->oe_bbuf[0] + s->block_vector_size; - s->oe_bbuf[2] = s->oe_bbuf[1] + s->block_vector_size; - s->oe_bbuf[3] = s->oe_bbuf[2] + s->block_vector_size; - s->oe_bbuf[4] = s->oe_bbuf[3] + s->block_vector_size; - s->oe_bbuf[5] = s->oe_bbuf[4] + s->block_vector_size; + MALLOC( s->oe_buf[0], complex_PRECISION, 4*vs ); + s->oe_buf[1] = s->oe_buf[0] + vs; + s->oe_buf[2] = s->oe_buf[1] + vs; + s->oe_buf[3] = s->oe_buf[2] + vs; } n = 0; @@ -176,24 +172,24 @@ void schwarz_PRECISION_alloc( schwarz_PRECISION_struct *s, level_struct *l ) { s->block[i].bt = NULL; MALLOC( s->block[i].bt, int, n ); } - - MALLOC( s->buf1, complex_PRECISION, vs+3*l->schwarz_vector_size ); + + MALLOC( s->buf1, complex_PRECISION, vs+3*svs ); s->buf2 = s->buf1 + vs; - s->buf3 = s->buf2 + 
l->schwarz_vector_size; - s->buf4 = s->buf3 + l->schwarz_vector_size; - + s->buf3 = s->buf2 + svs; + s->buf4 = s->buf3 + svs; + if ( g.method == 1 ) - MALLOC( s->buf5, complex_PRECISION, l->schwarz_vector_size ); + MALLOC( s->buf5, complex_PRECISION, svs ); MALLOC( l->sbuf_PRECISION[0], complex_PRECISION, 2*vs ); l->sbuf_PRECISION[1] = l->sbuf_PRECISION[0] + vs; // these buffers are introduced to make local_minres_PRECISION thread-safe - MALLOC( s->local_minres_buffer[0], complex_PRECISION, l->schwarz_vector_size ); - MALLOC( s->local_minres_buffer[1], complex_PRECISION, l->schwarz_vector_size ); - MALLOC( s->local_minres_buffer[2], complex_PRECISION, l->schwarz_vector_size ); + MALLOC( s->local_minres_buffer[0], complex_PRECISION, svs ); + MALLOC( s->local_minres_buffer[1], complex_PRECISION, svs ); + MALLOC( s->local_minres_buffer[2], complex_PRECISION, svs ); -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION +#ifdef OPTIMIZED_NEIGHBOR_COUPLING_float if ( l->depth == 0 ) { MALLOC_HUGEPAGES( s->op.D_vectorized, PRECISION, 2*4*(2*l->vector_size-l->inner_vector_size), 4*SIMD_LENGTH_PRECISION ); MALLOC_HUGEPAGES( s->op.D_transformed_vectorized, PRECISION, 2*4*(2*l->vector_size-l->inner_vector_size), 4*SIMD_LENGTH_PRECISION ); @@ -202,6 +198,9 @@ void schwarz_PRECISION_alloc( schwarz_PRECISION_struct *s, level_struct *l ) { #ifdef OPTIMIZED_SELF_COUPLING_PRECISION if ( l->depth == 0 ) { MALLOC_HUGEPAGES( s->op.clover_vectorized, PRECISION, 2*6*l->inner_vector_size, 4*SIMD_LENGTH_PRECISION ); +#ifdef HAVE_TM1p1 + MALLOC_HUGEPAGES( s->op.clover_doublet_vectorized, PRECISION, 4*2*6*l->inner_vector_size, 4*SIMD_LENGTH_PRECISION ); +#endif } #endif } @@ -209,7 +208,7 @@ void schwarz_PRECISION_alloc( schwarz_PRECISION_struct *s, level_struct *l ) { void schwarz_PRECISION_free( schwarz_PRECISION_struct *s, level_struct *l ) { - int i, n, mu, nu, *bl = l->block_lattice, vs = (l->depth==0)?l->inner_vector_size:l->vector_size; + int i, n, mu, nu, *bl = l->block_lattice; if ( g.method == 4 || g.method == 5 || g.method == 6 ) fgmres_PRECISION_struct_free( &(l->sp_PRECISION), l ); @@ -252,37 +251,42 @@ void schwarz_PRECISION_free( schwarz_PRECISION_struct *s, level_struct *l ) { } FREE( s->block, block_struct, s->num_blocks ); - FREE( s->bbuf1, complex_PRECISION, (l->depth==0&&g.odd_even?9:3)*s->block_vector_size ); - if ( l->depth == 0 ) { + + int svs = l->schwarz_vector_size, vs = (l->depth==0)?l->inner_vector_size:l->vector_size; + +#ifdef HAVE_TM1p1 + svs *= 2; + vs *= 2; +#endif + if ( l->depth == 0 ) { s->oe_buf[1] = NULL; s->oe_buf[2] = NULL; s->oe_buf[3] = NULL; - FREE( s->oe_buf[0], complex_PRECISION, 4*l->inner_vector_size ); + FREE( s->oe_buf[0], complex_PRECISION, 4*vs ); s->oe_buf[0] = NULL; } - s->bbuf2 = NULL; s->bbuf3 = NULL; s->oe_bbuf[0] = NULL; s->oe_bbuf[1] = NULL; - s->oe_bbuf[2] = NULL; s->oe_bbuf[3] = NULL; s->oe_bbuf[4] = NULL; s->oe_bbuf[5] = NULL; - FREE( s->buf1, complex_PRECISION, vs+3*l->schwarz_vector_size ); + + FREE( s->buf1, complex_PRECISION, vs+3*svs ); s->buf2 = NULL; s->buf3 = NULL; s->buf4 = NULL; if ( g.method == 1 ) - FREE( s->buf5, complex_PRECISION, l->schwarz_vector_size ); + FREE( s->buf5, complex_PRECISION, svs ); operator_PRECISION_free( &(s->op), _SCHWARZ, l ); FREE( l->sbuf_PRECISION[0], complex_PRECISION, 2*vs ); l->sbuf_PRECISION[1] = NULL; - FREE( s->local_minres_buffer[0], complex_PRECISION, l->schwarz_vector_size ); - FREE( s->local_minres_buffer[1], complex_PRECISION, l->schwarz_vector_size ); - FREE( s->local_minres_buffer[2], complex_PRECISION, 
l->schwarz_vector_size ); + FREE( s->local_minres_buffer[0], complex_PRECISION, svs ); + FREE( s->local_minres_buffer[1], complex_PRECISION, svs ); + FREE( s->local_minres_buffer[2], complex_PRECISION, svs ); s->local_minres_buffer[0] = NULL; s->local_minres_buffer[1] = NULL; s->local_minres_buffer[2] = NULL; -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION +#ifdef OPTIMIZED_NEIGHBOR_COUPLING_float if ( l->depth == 0 ) { FREE_HUGEPAGES( s->op.D_vectorized, PRECISION, 2*4*(2*l->vector_size-l->inner_vector_size) ); FREE_HUGEPAGES( s->op.D_transformed_vectorized, PRECISION, 2*4*(2*l->vector_size-l->inner_vector_size) ); @@ -291,6 +295,9 @@ void schwarz_PRECISION_free( schwarz_PRECISION_struct *s, level_struct *l ) { #ifdef OPTIMIZED_SELF_COUPLING_PRECISION if ( l->depth == 0 ) { FREE_HUGEPAGES( s->op.clover_vectorized, PRECISION, 2*6*l->inner_vector_size ); +#ifdef HAVE_TM1p1 + FREE_HUGEPAGES( s->op.clover_doublet_vectorized, PRECISION, 4*2*6*l->inner_vector_size ); +#endif } #endif } @@ -726,245 +733,547 @@ void schwarz_PRECISION_boundary_update( schwarz_PRECISION_struct *s, level_struc } } -#ifndef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION + void block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, int k, schwarz_PRECISION_struct *s, level_struct *l ) { // k: number of current block - int i, mu, index, neighbor_index, *bbl = s->block_boundary_length; - complex_PRECISION buf1[12], *buf2=buf1+6; - config_PRECISION D_pt, D = s->op.D; - vector_PRECISION phi_pt, eta_pt; - - mu=T; - // plus mu direction - for ( i=bbl[2*mu]; iblock[k].bt[i]; - neighbor_index = s->block[k].bt[i+1]; - D_pt = D + 36*index + 9*mu; - phi_pt = phi + 12*neighbor_index; - eta_pt = eta + 12*index; - prp_T_PRECISION( buf1, phi_pt ); - mvm_PRECISION( buf2, D_pt, buf1 ); - mvm_PRECISION( buf2+3, D_pt, buf1+3 ); - pbp_su3_T_PRECISION( buf2, eta_pt ); - } - // minus mu direction - for ( i=bbl[2*mu+1]; iblock[k].bt[i]; - neighbor_index = s->block[k].bt[i+1]; - D_pt = D + 36*neighbor_index + 9*mu; - phi_pt = phi + 12*neighbor_index; - eta_pt = eta + 12*index; - prn_T_PRECISION( buf1, phi_pt ); - mvmh_PRECISION( buf2, D_pt, buf1 ); - mvmh_PRECISION( buf2+3, D_pt, buf1+3 ); - pbn_su3_T_PRECISION( buf2, eta_pt ); - } + int *bbl = s->block_boundary_length; +#ifdef OPTIMIZED_NEIGHBOR_COUPLING_float + PRECISION *Dplus = s->op.D_vectorized; + PRECISION *Dminus = s->op.D_transformed_vectorized; - mu=Z; - // plus mu direction - for ( i=bbl[2*mu]; iblock[k].bt[i]; - neighbor_index = s->block[k].bt[i+1]; - D_pt = D + 36*index + 9*mu; - phi_pt = phi + 12*neighbor_index; - eta_pt = eta + 12*index; - prp_Z_PRECISION( buf1, phi_pt ); - mvm_PRECISION( buf2, D_pt, buf1 ); - mvm_PRECISION( buf2+3, D_pt, buf1+3 ); - pbp_su3_Z_PRECISION( buf2, eta_pt ); - } - // minus mu direction - for ( i=bbl[2*mu+1]; iblock[k].bt[i]; - neighbor_index = s->block[k].bt[i+1]; - D_pt = D + 36*neighbor_index + 9*mu; - phi_pt = phi + 12*neighbor_index; - eta_pt = eta + 12*index; - prn_Z_PRECISION( buf1, phi_pt ); - mvmh_PRECISION( buf2, D_pt, buf1 ); - mvmh_PRECISION( buf2+3, D_pt, buf1+3 ); - pbn_su3_Z_PRECISION( buf2, eta_pt ); + for ( int mu=0; mu<4; mu++ ) { + boundary_plus_coupling_PRECISION( (PRECISION*)eta, Dplus, (PRECISION*)phi, + mu, bbl[2*mu], bbl[2*mu+1], s->block[k].bt, NULL ); + boundary_minus_coupling_PRECISION( (PRECISION*)eta, Dminus, (PRECISION*)phi, + mu, bbl[2*mu+1], bbl[2*mu+2], s->block[k].bt, NULL ); } +#else + int i, mu, index, neighbor_index; + config_PRECISION D_pt, D = s->op.D; + vector_PRECISION phi_pt, eta_pt; + +#ifdef 
HAVE_TM1p1 + if( g.n_flavours == 2 ) { + complex_PRECISION buf1[24], *buf2=buf1+12; + mu=T; + // plus mu direction + for ( i=bbl[2*mu]; iblock[k].bt[i]; + neighbor_index = s->block[k].bt[i+1]; + D_pt = D + 36*index + 9*mu; + phi_pt = phi + 24*neighbor_index; + eta_pt = eta + 24*index; + dprp_T_PRECISION( buf1, phi_pt ); + mvm_PRECISION( buf2, D_pt, buf1 ); + mvm_PRECISION( buf2+3, D_pt, buf1+3 ); + mvm_PRECISION( buf2+6, D_pt, buf1+6 ); + mvm_PRECISION( buf2+9, D_pt, buf1+9 ); + dpbp_su3_T_PRECISION( buf2, eta_pt ); + } + // minus mu direction + for ( i=bbl[2*mu+1]; iblock[k].bt[i]; + neighbor_index = s->block[k].bt[i+1]; + D_pt = D + 36*neighbor_index + 9*mu; + phi_pt = phi + 24*neighbor_index; + eta_pt = eta + 24*index; + dprn_T_PRECISION( buf1, phi_pt ); + mvmh_PRECISION( buf2, D_pt, buf1 ); + mvmh_PRECISION( buf2+3, D_pt, buf1+3 ); + mvmh_PRECISION( buf2+6, D_pt, buf1+6 ); + mvmh_PRECISION( buf2+9, D_pt, buf1+9 ); + dpbn_su3_T_PRECISION( buf2, eta_pt ); + } + + mu=Z; + // plus mu direction + for ( i=bbl[2*mu]; iblock[k].bt[i]; + neighbor_index = s->block[k].bt[i+1]; + D_pt = D + 36*index + 9*mu; + phi_pt = phi + 24*neighbor_index; + eta_pt = eta + 24*index; + dprp_Z_PRECISION( buf1, phi_pt ); + mvm_PRECISION( buf2, D_pt, buf1 ); + mvm_PRECISION( buf2+3, D_pt, buf1+3 ); + mvm_PRECISION( buf2+6, D_pt, buf1+6 ); + mvm_PRECISION( buf2+9, D_pt, buf1+9 ); + dpbp_su3_Z_PRECISION( buf2, eta_pt ); + } + // minus mu direction + for ( i=bbl[2*mu+1]; iblock[k].bt[i]; + neighbor_index = s->block[k].bt[i+1]; + D_pt = D + 36*neighbor_index + 9*mu; + phi_pt = phi + 24*neighbor_index; + eta_pt = eta + 24*index; + dprn_Z_PRECISION( buf1, phi_pt ); + mvmh_PRECISION( buf2, D_pt, buf1 ); + mvmh_PRECISION( buf2+3, D_pt, buf1+3 ); + mvmh_PRECISION( buf2+6, D_pt, buf1+6 ); + mvmh_PRECISION( buf2+9, D_pt, buf1+9 ); + dpbn_su3_Z_PRECISION( buf2, eta_pt ); + } + + mu=Y; + // plus mu direction + for ( i=bbl[2*mu]; iblock[k].bt[i]; + neighbor_index = s->block[k].bt[i+1]; + D_pt = D + 36*index + 9*mu; + phi_pt = phi + 24*neighbor_index; + eta_pt = eta + 24*index; + dprp_Y_PRECISION( buf1, phi_pt ); + mvm_PRECISION( buf2, D_pt, buf1 ); + mvm_PRECISION( buf2+3, D_pt, buf1+3 ); + mvm_PRECISION( buf2+6, D_pt, buf1+6 ); + mvm_PRECISION( buf2+9, D_pt, buf1+9 ); + dpbp_su3_Y_PRECISION( buf2, eta_pt ); + } + // minus mu direction + for ( i=bbl[2*mu+1]; iblock[k].bt[i]; + neighbor_index = s->block[k].bt[i+1]; + D_pt = D + 36*neighbor_index + 9*mu; + phi_pt = phi + 24*neighbor_index; + eta_pt = eta + 24*index; + dprn_Y_PRECISION( buf1, phi_pt ); + mvmh_PRECISION( buf2, D_pt, buf1 ); + mvmh_PRECISION( buf2+3, D_pt, buf1+3 ); + mvmh_PRECISION( buf2+6, D_pt, buf1+6 ); + mvmh_PRECISION( buf2+9, D_pt, buf1+9 ); + dpbn_su3_Y_PRECISION( buf2, eta_pt ); + } - mu=Y; - // plus mu direction - for ( i=bbl[2*mu]; iblock[k].bt[i]; - neighbor_index = s->block[k].bt[i+1]; - D_pt = D + 36*index + 9*mu; - phi_pt = phi + 12*neighbor_index; - eta_pt = eta + 12*index; - prp_Y_PRECISION( buf1, phi_pt ); - mvm_PRECISION( buf2, D_pt, buf1 ); - mvm_PRECISION( buf2+3, D_pt, buf1+3 ); - pbp_su3_Y_PRECISION( buf2, eta_pt ); - } - // minus mu direction - for ( i=bbl[2*mu+1]; iblock[k].bt[i]; - neighbor_index = s->block[k].bt[i+1]; - D_pt = D + 36*neighbor_index + 9*mu; - phi_pt = phi + 12*neighbor_index; - eta_pt = eta + 12*index; - prn_Y_PRECISION( buf1, phi_pt ); - mvmh_PRECISION( buf2, D_pt, buf1 ); - mvmh_PRECISION( buf2+3, D_pt, buf1+3 ); - pbn_su3_Y_PRECISION( buf2, eta_pt ); - } + mu=X; + // plus mu direction + for ( i=bbl[2*mu]; iblock[k].bt[i]; 
+ neighbor_index = s->block[k].bt[i+1]; + D_pt = D + 36*index + 9*mu; + phi_pt = phi + 24*neighbor_index; + eta_pt = eta + 24*index; + dprp_X_PRECISION( buf1, phi_pt ); + mvm_PRECISION( buf2, D_pt, buf1 ); + mvm_PRECISION( buf2+3, D_pt, buf1+3 ); + mvm_PRECISION( buf2+6, D_pt, buf1+6 ); + mvm_PRECISION( buf2+9, D_pt, buf1+9 ); + dpbp_su3_X_PRECISION( buf2, eta_pt ); + } + // minus mu direction + for ( i=bbl[2*mu+1]; iblock[k].bt[i]; + neighbor_index = s->block[k].bt[i+1]; + D_pt = D + 36*neighbor_index + 9*mu; + phi_pt = phi + 24*neighbor_index; + eta_pt = eta + 24*index; + dprn_X_PRECISION( buf1, phi_pt ); + mvmh_PRECISION( buf2, D_pt, buf1 ); + mvmh_PRECISION( buf2+3, D_pt, buf1+3 ); + mvmh_PRECISION( buf2+6, D_pt, buf1+6 ); + mvmh_PRECISION( buf2+9, D_pt, buf1+9 ); + dpbn_su3_X_PRECISION( buf2, eta_pt ); + } + } else { +#endif + complex_PRECISION buf1[12], *buf2=buf1+6; + mu=T; + // plus mu direction + for ( i=bbl[2*mu]; iblock[k].bt[i]; + neighbor_index = s->block[k].bt[i+1]; + D_pt = D + 36*index + 9*mu; + phi_pt = phi + 12*neighbor_index; + eta_pt = eta + 12*index; + prp_T_PRECISION( buf1, phi_pt ); + mvm_PRECISION( buf2, D_pt, buf1 ); + mvm_PRECISION( buf2+3, D_pt, buf1+3 ); + pbp_su3_T_PRECISION( buf2, eta_pt ); + } + // minus mu direction + for ( i=bbl[2*mu+1]; iblock[k].bt[i]; + neighbor_index = s->block[k].bt[i+1]; + D_pt = D + 36*neighbor_index + 9*mu; + phi_pt = phi + 12*neighbor_index; + eta_pt = eta + 12*index; + prn_T_PRECISION( buf1, phi_pt ); + mvmh_PRECISION( buf2, D_pt, buf1 ); + mvmh_PRECISION( buf2+3, D_pt, buf1+3 ); + pbn_su3_T_PRECISION( buf2, eta_pt ); + } + + mu=Z; + // plus mu direction + for ( i=bbl[2*mu]; iblock[k].bt[i]; + neighbor_index = s->block[k].bt[i+1]; + D_pt = D + 36*index + 9*mu; + phi_pt = phi + 12*neighbor_index; + eta_pt = eta + 12*index; + prp_Z_PRECISION( buf1, phi_pt ); + mvm_PRECISION( buf2, D_pt, buf1 ); + mvm_PRECISION( buf2+3, D_pt, buf1+3 ); + pbp_su3_Z_PRECISION( buf2, eta_pt ); + } + // minus mu direction + for ( i=bbl[2*mu+1]; iblock[k].bt[i]; + neighbor_index = s->block[k].bt[i+1]; + D_pt = D + 36*neighbor_index + 9*mu; + phi_pt = phi + 12*neighbor_index; + eta_pt = eta + 12*index; + prn_Z_PRECISION( buf1, phi_pt ); + mvmh_PRECISION( buf2, D_pt, buf1 ); + mvmh_PRECISION( buf2+3, D_pt, buf1+3 ); + pbn_su3_Z_PRECISION( buf2, eta_pt ); + } + + mu=Y; + // plus mu direction + for ( i=bbl[2*mu]; iblock[k].bt[i]; + neighbor_index = s->block[k].bt[i+1]; + D_pt = D + 36*index + 9*mu; + phi_pt = phi + 12*neighbor_index; + eta_pt = eta + 12*index; + prp_Y_PRECISION( buf1, phi_pt ); + mvm_PRECISION( buf2, D_pt, buf1 ); + mvm_PRECISION( buf2+3, D_pt, buf1+3 ); + pbp_su3_Y_PRECISION( buf2, eta_pt ); + } + // minus mu direction + for ( i=bbl[2*mu+1]; iblock[k].bt[i]; + neighbor_index = s->block[k].bt[i+1]; + D_pt = D + 36*neighbor_index + 9*mu; + phi_pt = phi + 12*neighbor_index; + eta_pt = eta + 12*index; + prn_Y_PRECISION( buf1, phi_pt ); + mvmh_PRECISION( buf2, D_pt, buf1 ); + mvmh_PRECISION( buf2+3, D_pt, buf1+3 ); + pbn_su3_Y_PRECISION( buf2, eta_pt ); + } - mu=X; - // plus mu direction - for ( i=bbl[2*mu]; iblock[k].bt[i]; - neighbor_index = s->block[k].bt[i+1]; - D_pt = D + 36*index + 9*mu; - phi_pt = phi + 12*neighbor_index; - eta_pt = eta + 12*index; - prp_X_PRECISION( buf1, phi_pt ); - mvm_PRECISION( buf2, D_pt, buf1 ); - mvm_PRECISION( buf2+3, D_pt, buf1+3 ); - pbp_su3_X_PRECISION( buf2, eta_pt ); + mu=X; + // plus mu direction + for ( i=bbl[2*mu]; iblock[k].bt[i]; + neighbor_index = s->block[k].bt[i+1]; + D_pt = D + 36*index + 9*mu; + 
phi_pt = phi + 12*neighbor_index; + eta_pt = eta + 12*index; + prp_X_PRECISION( buf1, phi_pt ); + mvm_PRECISION( buf2, D_pt, buf1 ); + mvm_PRECISION( buf2+3, D_pt, buf1+3 ); + pbp_su3_X_PRECISION( buf2, eta_pt ); + } + // minus mu direction + for ( i=bbl[2*mu+1]; iblock[k].bt[i]; + neighbor_index = s->block[k].bt[i+1]; + D_pt = D + 36*neighbor_index + 9*mu; + phi_pt = phi + 12*neighbor_index; + eta_pt = eta + 12*index; + prn_X_PRECISION( buf1, phi_pt ); + mvmh_PRECISION( buf2, D_pt, buf1 ); + mvmh_PRECISION( buf2+3, D_pt, buf1+3 ); + pbn_su3_X_PRECISION( buf2, eta_pt ); + } +#ifdef HAVE_TM1p1 } - // minus mu direction - for ( i=bbl[2*mu+1]; iblock[k].bt[i]; - neighbor_index = s->block[k].bt[i+1]; - D_pt = D + 36*neighbor_index + 9*mu; - phi_pt = phi + 12*neighbor_index; - eta_pt = eta + 12*index; - prn_X_PRECISION( buf1, phi_pt ); - mvmh_PRECISION( buf2, D_pt, buf1 ); - mvmh_PRECISION( buf2+3, D_pt, buf1+3 ); - pbn_su3_X_PRECISION( buf2, eta_pt ); - } -} #endif +#endif +} -#ifndef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION void n_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, int k, schwarz_PRECISION_struct *s, level_struct *l ) { // k: number of current block - int i, mu, index, neighbor_index, *bbl = s->block_boundary_length; - complex_PRECISION buf1[12], *buf2=buf1+6; - config_PRECISION D_pt, D = s->op.D; - vector_PRECISION phi_pt, eta_pt; + int *bbl = s->block_boundary_length; +#ifdef OPTIMIZED_NEIGHBOR_COUPLING_float + PRECISION *Dplus = s->op.D_vectorized; + PRECISION *Dminus = s->op.D_transformed_vectorized; - mu=T; - // plus mu direction - for ( i=bbl[2*mu]; iblock[k].bt[i]; - neighbor_index = s->block[k].bt[i+1]; - D_pt = D + 36*index + 9*mu; - phi_pt = phi + 12*neighbor_index; - eta_pt = eta + 12*index; - prp_T_PRECISION( buf1, phi_pt ); - nmvm_PRECISION( buf2, D_pt, buf1 ); - nmvm_PRECISION( buf2+3, D_pt, buf1+3 ); - pbp_su3_T_PRECISION( buf2, eta_pt ); - } - // minus mu direction - for ( i=bbl[2*mu+1]; iblock[k].bt[i]; - neighbor_index = s->block[k].bt[i+1]; - D_pt = D + 36*neighbor_index + 9*mu; - phi_pt = phi + 12*neighbor_index; - eta_pt = eta + 12*index; - prn_T_PRECISION( buf1, phi_pt ); - nmvmh_PRECISION( buf2, D_pt, buf1 ); - nmvmh_PRECISION( buf2+3, D_pt, buf1+3 ); - pbn_su3_T_PRECISION( buf2, eta_pt ); - } - - mu=Z; - // plus mu direction - for ( i=bbl[2*mu]; iblock[k].bt[i]; - neighbor_index = s->block[k].bt[i+1]; - D_pt = D + 36*index + 9*mu; - phi_pt = phi + 12*neighbor_index; - eta_pt = eta + 12*index; - prp_Z_PRECISION( buf1, phi_pt ); - nmvm_PRECISION( buf2, D_pt, buf1 ); - nmvm_PRECISION( buf2+3, D_pt, buf1+3 ); - pbp_su3_Z_PRECISION( buf2, eta_pt ); - } - // minus mu direction - for ( i=bbl[2*mu+1]; iblock[k].bt[i]; - neighbor_index = s->block[k].bt[i+1]; - D_pt = D + 36*neighbor_index + 9*mu; - phi_pt = phi + 12*neighbor_index; - eta_pt = eta + 12*index; - prn_Z_PRECISION( buf1, phi_pt ); - nmvmh_PRECISION( buf2, D_pt, buf1 ); - nmvmh_PRECISION( buf2+3, D_pt, buf1+3 ); - pbn_su3_Z_PRECISION( buf2, eta_pt ); - } - - mu=Y; - // plus mu direction - for ( i=bbl[2*mu]; iblock[k].bt[i]; - neighbor_index = s->block[k].bt[i+1]; - D_pt = D + 36*index + 9*mu; - phi_pt = phi + 12*neighbor_index; - eta_pt = eta + 12*index; - prp_Y_PRECISION( buf1, phi_pt ); - nmvm_PRECISION( buf2, D_pt, buf1 ); - nmvm_PRECISION( buf2+3, D_pt, buf1+3 ); - pbp_su3_Y_PRECISION( buf2, eta_pt ); - } - // minus mu direction - for ( i=bbl[2*mu+1]; iblock[k].bt[i]; - neighbor_index = s->block[k].bt[i+1]; - D_pt = D + 36*neighbor_index + 9*mu; - phi_pt = phi + 
12*neighbor_index; - eta_pt = eta + 12*index; - prn_Y_PRECISION( buf1, phi_pt ); - nmvmh_PRECISION( buf2, D_pt, buf1 ); - nmvmh_PRECISION( buf2+3, D_pt, buf1+3 ); - pbn_su3_Y_PRECISION( buf2, eta_pt ); + for ( int mu=0; mu<4; mu++ ) { + boundary_nplus_coupling_PRECISION( (PRECISION*)eta, Dplus, (PRECISION*)phi, + mu, bbl[2*mu], bbl[2*mu+1], s->block[k].bt, NULL ); + boundary_nminus_coupling_PRECISION( (PRECISION*)eta, Dminus, (PRECISION*)phi, + mu, bbl[2*mu+1], bbl[2*mu+2], s->block[k].bt, NULL ); } +#else + int i, mu, index, neighbor_index; + config_PRECISION D_pt, D = s->op.D; + vector_PRECISION phi_pt, eta_pt; + +#ifdef HAVE_TM1p1 + if( g.n_flavours == 2 ) { + complex_PRECISION buf1[24], *buf2=buf1+12; + mu=T; + // plus mu direction + for ( i=bbl[2*mu]; iblock[k].bt[i]; + neighbor_index = s->block[k].bt[i+1]; + D_pt = D + 36*index + 9*mu; + phi_pt = phi + 24*neighbor_index; + eta_pt = eta + 24*index; + dprp_T_PRECISION( buf1, phi_pt ); + nmvm_PRECISION( buf2, D_pt, buf1 ); + nmvm_PRECISION( buf2+3, D_pt, buf1+3 ); + nmvm_PRECISION( buf2+6, D_pt, buf1+6 ); + nmvm_PRECISION( buf2+9, D_pt, buf1+9 ); + dpbp_su3_T_PRECISION( buf2, eta_pt ); + } + // minus mu direction + for ( i=bbl[2*mu+1]; iblock[k].bt[i]; + neighbor_index = s->block[k].bt[i+1]; + D_pt = D + 36*neighbor_index + 9*mu; + phi_pt = phi + 24*neighbor_index; + eta_pt = eta + 24*index; + dprn_T_PRECISION( buf1, phi_pt ); + nmvmh_PRECISION( buf2, D_pt, buf1 ); + nmvmh_PRECISION( buf2+3, D_pt, buf1+3 ); + nmvmh_PRECISION( buf2+6, D_pt, buf1+6 ); + nmvmh_PRECISION( buf2+9, D_pt, buf1+9 ); + dpbn_su3_T_PRECISION( buf2, eta_pt ); + } + + mu=Z; + // plus mu direction + for ( i=bbl[2*mu]; iblock[k].bt[i]; + neighbor_index = s->block[k].bt[i+1]; + D_pt = D + 36*index + 9*mu; + phi_pt = phi + 24*neighbor_index; + eta_pt = eta + 24*index; + dprp_Z_PRECISION( buf1, phi_pt ); + nmvm_PRECISION( buf2, D_pt, buf1 ); + nmvm_PRECISION( buf2+3, D_pt, buf1+3 ); + nmvm_PRECISION( buf2+6, D_pt, buf1+6 ); + nmvm_PRECISION( buf2+9, D_pt, buf1+9 ); + dpbp_su3_Z_PRECISION( buf2, eta_pt ); + } + // minus mu direction + for ( i=bbl[2*mu+1]; iblock[k].bt[i]; + neighbor_index = s->block[k].bt[i+1]; + D_pt = D + 36*neighbor_index + 9*mu; + phi_pt = phi + 24*neighbor_index; + eta_pt = eta + 24*index; + dprn_Z_PRECISION( buf1, phi_pt ); + nmvmh_PRECISION( buf2, D_pt, buf1 ); + nmvmh_PRECISION( buf2+3, D_pt, buf1+3 ); + nmvmh_PRECISION( buf2+6, D_pt, buf1+6 ); + nmvmh_PRECISION( buf2+9, D_pt, buf1+9 ); + dpbn_su3_Z_PRECISION( buf2, eta_pt ); + } + + mu=Y; + // plus mu direction + for ( i=bbl[2*mu]; iblock[k].bt[i]; + neighbor_index = s->block[k].bt[i+1]; + D_pt = D + 36*index + 9*mu; + phi_pt = phi + 24*neighbor_index; + eta_pt = eta + 24*index; + dprp_Y_PRECISION( buf1, phi_pt ); + nmvm_PRECISION( buf2, D_pt, buf1 ); + nmvm_PRECISION( buf2+3, D_pt, buf1+3 ); + nmvm_PRECISION( buf2+6, D_pt, buf1+6 ); + nmvm_PRECISION( buf2+9, D_pt, buf1+9 ); + dpbp_su3_Y_PRECISION( buf2, eta_pt ); + } + // minus mu direction + for ( i=bbl[2*mu+1]; iblock[k].bt[i]; + neighbor_index = s->block[k].bt[i+1]; + D_pt = D + 36*neighbor_index + 9*mu; + phi_pt = phi + 24*neighbor_index; + eta_pt = eta + 24*index; + dprn_Y_PRECISION( buf1, phi_pt ); + nmvmh_PRECISION( buf2, D_pt, buf1 ); + nmvmh_PRECISION( buf2+3, D_pt, buf1+3 ); + nmvmh_PRECISION( buf2+6, D_pt, buf1+6 ); + nmvmh_PRECISION( buf2+9, D_pt, buf1+9 ); + dpbn_su3_Y_PRECISION( buf2, eta_pt ); + } - mu=X; - // plus mu direction - for ( i=bbl[2*mu]; iblock[k].bt[i]; - neighbor_index = s->block[k].bt[i+1]; - D_pt = D + 36*index + 
9*mu; - phi_pt = phi + 12*neighbor_index; - eta_pt = eta + 12*index; - prp_X_PRECISION( buf1, phi_pt ); - nmvm_PRECISION( buf2, D_pt, buf1 ); - nmvm_PRECISION( buf2+3, D_pt, buf1+3 ); - pbp_su3_X_PRECISION( buf2, eta_pt ); + mu=X; + // plus mu direction + for ( i=bbl[2*mu]; iblock[k].bt[i]; + neighbor_index = s->block[k].bt[i+1]; + D_pt = D + 36*index + 9*mu; + phi_pt = phi + 24*neighbor_index; + eta_pt = eta + 24*index; + dprp_X_PRECISION( buf1, phi_pt ); + nmvm_PRECISION( buf2, D_pt, buf1 ); + nmvm_PRECISION( buf2+3, D_pt, buf1+3 ); + nmvm_PRECISION( buf2+6, D_pt, buf1+6 ); + nmvm_PRECISION( buf2+9, D_pt, buf1+9 ); + dpbp_su3_X_PRECISION( buf2, eta_pt ); + } + // minus mu direction + for ( i=bbl[2*mu+1]; iblock[k].bt[i]; + neighbor_index = s->block[k].bt[i+1]; + D_pt = D + 36*neighbor_index + 9*mu; + phi_pt = phi + 24*neighbor_index; + eta_pt = eta + 24*index; + dprn_X_PRECISION( buf1, phi_pt ); + nmvmh_PRECISION( buf2, D_pt, buf1 ); + nmvmh_PRECISION( buf2+3, D_pt, buf1+3 ); + nmvmh_PRECISION( buf2+6, D_pt, buf1+6 ); + nmvmh_PRECISION( buf2+9, D_pt, buf1+9 ); + dpbn_su3_X_PRECISION( buf2, eta_pt ); + } + } else { +#endif + complex_PRECISION buf1[12], *buf2=buf1+6; + mu=T; + // plus mu direction + for ( i=bbl[2*mu]; iblock[k].bt[i]; + neighbor_index = s->block[k].bt[i+1]; + D_pt = D + 36*index + 9*mu; + phi_pt = phi + 12*neighbor_index; + eta_pt = eta + 12*index; + prp_T_PRECISION( buf1, phi_pt ); + nmvm_PRECISION( buf2, D_pt, buf1 ); + nmvm_PRECISION( buf2+3, D_pt, buf1+3 ); + pbp_su3_T_PRECISION( buf2, eta_pt ); + } + // minus mu direction + for ( i=bbl[2*mu+1]; iblock[k].bt[i]; + neighbor_index = s->block[k].bt[i+1]; + D_pt = D + 36*neighbor_index + 9*mu; + phi_pt = phi + 12*neighbor_index; + eta_pt = eta + 12*index; + prn_T_PRECISION( buf1, phi_pt ); + nmvmh_PRECISION( buf2, D_pt, buf1 ); + nmvmh_PRECISION( buf2+3, D_pt, buf1+3 ); + pbn_su3_T_PRECISION( buf2, eta_pt ); + } + + mu=Z; + // plus mu direction + for ( i=bbl[2*mu]; iblock[k].bt[i]; + neighbor_index = s->block[k].bt[i+1]; + D_pt = D + 36*index + 9*mu; + phi_pt = phi + 12*neighbor_index; + eta_pt = eta + 12*index; + prp_Z_PRECISION( buf1, phi_pt ); + nmvm_PRECISION( buf2, D_pt, buf1 ); + nmvm_PRECISION( buf2+3, D_pt, buf1+3 ); + pbp_su3_Z_PRECISION( buf2, eta_pt ); + } + // minus mu direction + for ( i=bbl[2*mu+1]; iblock[k].bt[i]; + neighbor_index = s->block[k].bt[i+1]; + D_pt = D + 36*neighbor_index + 9*mu; + phi_pt = phi + 12*neighbor_index; + eta_pt = eta + 12*index; + prn_Z_PRECISION( buf1, phi_pt ); + nmvmh_PRECISION( buf2, D_pt, buf1 ); + nmvmh_PRECISION( buf2+3, D_pt, buf1+3 ); + pbn_su3_Z_PRECISION( buf2, eta_pt ); + } + + mu=Y; + // plus mu direction + for ( i=bbl[2*mu]; iblock[k].bt[i]; + neighbor_index = s->block[k].bt[i+1]; + D_pt = D + 36*index + 9*mu; + phi_pt = phi + 12*neighbor_index; + eta_pt = eta + 12*index; + prp_Y_PRECISION( buf1, phi_pt ); + nmvm_PRECISION( buf2, D_pt, buf1 ); + nmvm_PRECISION( buf2+3, D_pt, buf1+3 ); + pbp_su3_Y_PRECISION( buf2, eta_pt ); + } + // minus mu direction + for ( i=bbl[2*mu+1]; iblock[k].bt[i]; + neighbor_index = s->block[k].bt[i+1]; + D_pt = D + 36*neighbor_index + 9*mu; + phi_pt = phi + 12*neighbor_index; + eta_pt = eta + 12*index; + prn_Y_PRECISION( buf1, phi_pt ); + nmvmh_PRECISION( buf2, D_pt, buf1 ); + nmvmh_PRECISION( buf2+3, D_pt, buf1+3 ); + pbn_su3_Y_PRECISION( buf2, eta_pt ); + } + + mu=X; + // plus mu direction + for ( i=bbl[2*mu]; iblock[k].bt[i]; + neighbor_index = s->block[k].bt[i+1]; + D_pt = D + 36*index + 9*mu; + phi_pt = phi + 12*neighbor_index; + eta_pt 
= eta + 12*index; + prp_X_PRECISION( buf1, phi_pt ); + nmvm_PRECISION( buf2, D_pt, buf1 ); + nmvm_PRECISION( buf2+3, D_pt, buf1+3 ); + pbp_su3_X_PRECISION( buf2, eta_pt ); + } + // minus mu direction + for ( i=bbl[2*mu+1]; iblock[k].bt[i]; + neighbor_index = s->block[k].bt[i+1]; + D_pt = D + 36*neighbor_index + 9*mu; + phi_pt = phi + 12*neighbor_index; + eta_pt = eta + 12*index; + prn_X_PRECISION( buf1, phi_pt ); + nmvmh_PRECISION( buf2, D_pt, buf1 ); + nmvmh_PRECISION( buf2+3, D_pt, buf1+3 ); + pbn_su3_X_PRECISION( buf2, eta_pt ); + } +#ifdef HAVE_TM1p1 } - // minus mu direction - for ( i=bbl[2*mu+1]; iblock[k].bt[i]; - neighbor_index = s->block[k].bt[i+1]; - D_pt = D + 36*neighbor_index + 9*mu; - phi_pt = phi + 12*neighbor_index; - eta_pt = eta + 12*index; - prn_X_PRECISION( buf1, phi_pt ); - nmvmh_PRECISION( buf2, D_pt, buf1 ); - nmvmh_PRECISION( buf2+3, D_pt, buf1+3 ); - pbn_su3_X_PRECISION( buf2, eta_pt ); - } -} #endif +#endif +} -#ifndef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION void coarse_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, int k, schwarz_PRECISION_struct *s, level_struct *l ) { // k: number of current block int *bbl = s->block_boundary_length, n = l->num_lattice_site_var; + +#ifdef OPTIMIZED_COARSE_NEIGHBOR_COUPLING_PRECISION + int column_offset = 2*SIMD_LENGTH_PRECISION*((l->num_parent_eig_vect+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); + int vectorized_link_offset = 4*l->num_parent_eig_vect*column_offset; + + for ( int mu=0; mu<4; mu++ ) { + OPERATOR_TYPE_PRECISION *Dplus = s->op.D_vectorized + mu*vectorized_link_offset; + OPERATOR_TYPE_PRECISION *Dminus = s->op.D_transformed_vectorized + mu*vectorized_link_offset; + // plus mu direction + for ( int i=bbl[2*mu]; iblock[k].bt[i]; + int neighbor_index = s->block[k].bt[i+1]; + vector_PRECISION phi_pt = phi + n*neighbor_index; + vector_PRECISION eta_pt = eta + n*index; + coarse_pn_hopp_PRECISION_vectorized( eta_pt, phi_pt, Dplus + 4*vectorized_link_offset*index, +1, l ); + } + // minus mu direction + for ( int i=bbl[2*mu+1]; iblock[k].bt[i]; + int neighbor_index = s->block[k].bt[i+1]; + vector_PRECISION phi_pt = phi + n*neighbor_index; + vector_PRECISION eta_pt = eta + n*index; + coarse_pn_hopp_PRECISION_vectorized( eta_pt, phi_pt, Dminus + 4*vectorized_link_offset*neighbor_index, +1, l ); + } + } +#else config_PRECISION D = s->op.D; - int link_size = SQUARE(l->num_lattice_site_var), site_size=4*link_size; + int link_size = SQUARE(2*l->num_parent_eig_vect), site_size=4*link_size; for ( int mu=0; mu<4; mu++ ) { // plus mu direction @@ -974,7 +1283,7 @@ void coarse_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION vector_PRECISION phi_pt = phi + n*neighbor_index; vector_PRECISION eta_pt = eta + n*index; config_PRECISION D_pt = D + site_size*index + link_size*mu; - coarse_hopp_PRECISION( eta_pt, phi_pt, D_pt, l ); + coarse_pn_hopp_PRECISION( eta_pt, phi_pt, D_pt, +1, l ); } // minus mu direction for ( int i=bbl[2*mu+1]; iblock_boundary_length, n = l->num_lattice_site_var; - int link_size = SQUARE(l->num_lattice_site_var), site_size=4*link_size; +#ifdef OPTIMIZED_COARSE_NEIGHBOR_COUPLING_PRECISION + int column_offset = 2*SIMD_LENGTH_PRECISION*((l->num_parent_eig_vect+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); + int vectorized_link_offset = 4*l->num_parent_eig_vect*column_offset; + + for ( int mu=0; mu<4; mu++ ) { + OPERATOR_TYPE_PRECISION *Dplus = s->op.D_vectorized + mu*vectorized_link_offset; + OPERATOR_TYPE_PRECISION *Dminus = s->op.D_transformed_vectorized + 
mu*vectorized_link_offset; + // plus mu direction + for ( int i=bbl[2*mu]; iblock[k].bt[i]; + int neighbor_index = s->block[k].bt[i+1]; + vector_PRECISION phi_pt = phi + n*neighbor_index; + vector_PRECISION eta_pt = eta + n*index; + coarse_pn_hopp_PRECISION_vectorized( eta_pt, phi_pt, Dplus + 4*vectorized_link_offset*index, -1, l ); + } + // minus mu direction + for ( int i=bbl[2*mu+1]; iblock[k].bt[i]; + int neighbor_index = s->block[k].bt[i+1]; + vector_PRECISION phi_pt = phi + n*neighbor_index; + vector_PRECISION eta_pt = eta + n*index; + coarse_pn_hopp_PRECISION_vectorized( eta_pt, phi_pt, Dminus + 4*vectorized_link_offset*neighbor_index, -1, l ); + } + } +#else + int link_size = SQUARE(2*l->num_parent_eig_vect), site_size=4*link_size; config_PRECISION D = s->op.D; for ( int mu=0; mu<4; mu++ ) { @@ -1005,7 +1339,7 @@ void n_coarse_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISIO vector_PRECISION phi_pt = phi + n*neighbor_index; vector_PRECISION eta_pt = eta + n*index; config_PRECISION D_pt = D + site_size*index + link_size*mu; - coarse_n_hopp_PRECISION( eta_pt, phi_pt, D_pt, l ); + coarse_pn_hopp_PRECISION( eta_pt, phi_pt, D_pt, -1, l ); } // minus mu direction for ( int i=bbl[2*mu+1]; inum_inner_lattice_sites, *tt = s->op.translation_table; - config_PRECISION D_out_pt, clover_out_pt; - config_double D_in_pt = op_in->D, clover_in_pt = op_in->clover; + config_PRECISION D_out_pt, clover_out_pt, odd_proj_out_pt; + config_double D_in_pt = op_in->D, clover_in_pt = op_in->clover, odd_proj_in_pt = op_in->odd_proj; + s->op.m0 = op_in->m0; + for ( i=0; iop.D + 36*index; - FOR36( *D_out_pt = (complex_PRECISION) *D_in_pt; D_out_pt++; D_in_pt++; ) + FOR36( *D_out_pt = (complex_PRECISION) *D_in_pt; D_out_pt++; D_in_pt++; ); } if ( g.csw != 0 ) { for ( i=0; iop.clover + 42*index; - FOR42( *clover_out_pt = (complex_PRECISION) *clover_in_pt; clover_out_pt++; clover_in_pt++; ) + FOR42( *clover_out_pt = (complex_PRECISION) *clover_in_pt; clover_out_pt++; clover_in_pt++; ); } } else { for ( i=0; iop.clover + 12*index; - FOR12( *clover_out_pt = (complex_PRECISION) *clover_in_pt; clover_out_pt++; clover_in_pt++; ) + FOR12( *clover_out_pt = (complex_PRECISION) *clover_in_pt; clover_out_pt++; clover_in_pt++; ); } } -#ifdef HAVE_TM - config_PRECISION tm_term_out_pt, odd_proj_out_pt; - config_double tm_term_in_pt = op_in->tm_term, odd_proj_in_pt = op_in->odd_proj; - for ( i=0; iop.tm_term + 12*index; - FOR12( *tm_term_out_pt = (complex_PRECISION) *tm_term_in_pt; tm_term_out_pt++; tm_term_in_pt++; ) - } - for ( i=0; iop.odd_proj + 12*index; - FOR12( *odd_proj_out_pt = (complex_PRECISION) *odd_proj_in_pt; odd_proj_out_pt++; odd_proj_in_pt++; ) + FOR12( *odd_proj_out_pt = (complex_PRECISION) *odd_proj_in_pt; odd_proj_out_pt++; odd_proj_in_pt++; ); } + +#ifdef HAVE_TM + tm_term_PRECISION_setup( (PRECISION) (g.mu_factor[l->depth]*op_in->mu), (PRECISION) (g.mu_factor[l->depth]*op_in->mu_even_shift), (PRECISION) (g.mu_factor[l->depth]*op_in->mu_odd_shift), &(s->op), l, no_threading ); #endif - - if ( g.odd_even ) - schwarz_PRECISION_oddeven_setup( &(s->op), l ); - + +#ifdef HAVE_TM1p1 + epsbar_term_PRECISION_setup( (PRECISION) (g.epsbar_factor[l->depth]*op_in->epsbar), (PRECISION) (g.epsbar_factor[l->depth]*op_in->epsbar_ig5_even_shift), (PRECISION) (g.epsbar_factor[l->depth]*op_in->epsbar_ig5_odd_shift), &(s->op), l, no_threading ); +#endif + schwarz_PRECISION_boundary_update( s, l ); + + operator_PRECISION_set_couplings( &(s->op), l ); + + if ( g.method >= 4 && g.odd_even ) + 
oddeven_setup_PRECISION( &(g.op_double), l ); + else + schwarz_PRECISION_oddeven_setup( s, l ); + } -#endif void additive_schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_phi, vector_PRECISION eta, const int cycles, int res, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ) { @@ -1083,10 +1421,11 @@ void additive_schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_phi, v int k, mu, i, nb = s->num_blocks; vector_PRECISION r = s->buf1, Dphi = s->buf4, latest_iter = s->buf2, x = s->buf3, latest_iter2 = s->buf5, swap = NULL; - void (*block_op)() = (l->depth==0)?block_d_plus_clover_PRECISION:coarse_block_operator_PRECISION, - (*boundary_op)() = (l->depth==0)?block_PRECISION_boundary_op:coarse_block_PRECISION_boundary_op, + void (*block_op)() = (l->depth==0)?block_d_plus_clover_PRECISION:coarse_block_operator_PRECISION; + void (*boundary_op)() = (l->depth==0)?block_PRECISION_boundary_op:coarse_block_PRECISION_boundary_op, (*n_boundary_op)() = (l->depth==0)?n_block_PRECISION_boundary_op:n_coarse_block_PRECISION_boundary_op, - (*block_solve)() = (l->depth==0&&g.odd_even)?block_solve_oddeven_PRECISION:local_minres_PRECISION; + (* block_solve)() = (l->depth==0&&g.odd_even)?block_solve_oddeven_PRECISION:local_minres_PRECISION; + int nb_thread_start; int nb_thread_end; @@ -1096,17 +1435,13 @@ void additive_schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_phi, v if ( res == _NO_RES ) { vector_PRECISION_copy( r, eta, nb_thread_start*s->block_vector_size, nb_thread_end*s->block_vector_size, l ); - vector_PRECISION_define( x, 0, nb_thread_start*s->block_vector_size, nb_thread_end*s->block_vector_size, l ); + vector_PRECISION_define_zero( x, 0, nb*s->block_vector_size, l, threading ); + vector_PRECISION_define_zero( x, l->inner_vector_size, l->schwarz_vector_size, l, threading ); } else { vector_PRECISION_copy( x, phi, nb_thread_start*s->block_vector_size, nb_thread_end*s->block_vector_size, l ); vector_PRECISION_copy( latest_iter, phi, nb_thread_start*s->block_vector_size, nb_thread_end*s->block_vector_size, l ); } - START_MASTER(threading) - if ( res == _NO_RES ) { - vector_PRECISION_define( x, 0, l->inner_vector_size, l->schwarz_vector_size, l ); - } - END_MASTER(threading) SYNC_CORES(threading) @@ -1132,7 +1467,7 @@ void additive_schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_phi, v boundary_op( Dphi, latest_iter, i, s, l, no_threading ); vector_PRECISION_minus( r, eta, Dphi, s->block[i].start*l->num_lattice_site_var, s->block[i].start*l->num_lattice_site_var+s->block_vector_size, l ); - } else { + } else { n_boundary_op( r, latest_iter, i, s, l ); } } @@ -1176,7 +1511,7 @@ void additive_schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_phi, v for ( i=nb_thread_start; irelax_fac != 1.0 ) - vector_PRECISION_scale( phi, x, l->relax_fac, s->block[i].start*l->num_lattice_site_var, s->block[i].start*l->num_lattice_site_var+s->block_vector_size, l ); + vector_PRECISION_scale( phi, x, l->relax_fac, s->block[i].start*l->num_lattice_site_var, s->block[i].start*l->num_lattice_site_var+s->block_vector_size, l, no_threading ); else vector_PRECISION_copy( phi, x, s->block[i].start*l->num_lattice_site_var, s->block[i].start*l->num_lattice_site_var+s->block_vector_size, l ); } @@ -1198,7 +1533,7 @@ void additive_schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_phi, v s->block[i].start*l->num_lattice_site_var+s->block_vector_size, l ); if ( l->relax_fac != 1.0 ) vector_PRECISION_scale( D_phi, D_phi, l->relax_fac, 
s->block[i].start*l->num_lattice_site_var, - s->block[i].start*l->num_lattice_site_var+s->block_vector_size, l ); + s->block[i].start*l->num_lattice_site_var+s->block_vector_size, l, no_threading ); } } @@ -1217,7 +1552,7 @@ void additive_schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_phi, v s->block[i].start*l->num_lattice_site_var+s->block_vector_size, l ); if ( l->relax_fac != 1.0 ) vector_PRECISION_scale( D_phi, D_phi, l->relax_fac, s->block[i].start*l->num_lattice_site_var, - s->block[i].start*l->num_lattice_site_var+s->block_vector_size, l ); + s->block[i].start*l->num_lattice_site_var+s->block_vector_size, l, no_threading ); } } } @@ -1248,9 +1583,9 @@ void additive_schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_phi, v } } } - double rnorm = global_norm_PRECISION( r, 0, l->inner_vector_size, l, no_threading ); + PRECISION r_norm = global_norm_PRECISION( r, 0, l->inner_vector_size, l, no_threading ); char number[3]; sprintf( number, "%2d", 31+l->depth ); printf0("\033[1;%2sm|", number ); - printf0(" ---- depth: %d, c: %d, schwarz iter %2d, norm: %11.6le |", l->depth, s->num_colors, k, rnorm ); + printf0(" ---- depth: %d, c: %d, schwarz iter %2d, norm: %11.6le |", l->depth, s->num_colors, k, r_norm ); printf0("\033[0m\n"); fflush(0); END_LOCKED_MASTER(threading) #endif @@ -1269,15 +1604,12 @@ void red_black_schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_phi, vector_PRECISION Dphi = s->buf4; vector_PRECISION latest_iter = s->buf2; vector_PRECISION x = s->buf3; - void (*block_op)() = (l->depth==0)?block_d_plus_clover_PRECISION:coarse_block_operator_PRECISION, - (*boundary_op)() = (l->depth==0)?block_PRECISION_boundary_op:coarse_block_PRECISION_boundary_op, + void (*block_op)() = (l->depth==0)?block_d_plus_clover_PRECISION:coarse_block_operator_PRECISION; + void (*boundary_op)() = (l->depth==0)?block_PRECISION_boundary_op:coarse_block_PRECISION_boundary_op, (*n_boundary_op)() = (l->depth==0)?n_block_PRECISION_boundary_op:n_coarse_block_PRECISION_boundary_op, (*block_solve)() = (l->depth==0&&g.odd_even)?block_solve_oddeven_PRECISION:local_minres_PRECISION; void (*communicate[2])() = {ghost_update_wait_PRECISION, ghost_update_PRECISION}; int commdir[8] = {+1,-1,-1,+1,-1,+1,+1,-1}; -#ifdef SCHWARZ_RES - int nb = s->num_blocks; -#endif SYNC_CORES(threading) @@ -1289,10 +1621,8 @@ void red_black_schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_phi, if ( res == _NO_RES ) { vector_PRECISION_copy( r, eta, start, end, l ); - vector_PRECISION_define( x, 0, start, end, l ); - START_MASTER(threading) - vector_PRECISION_define( x, 0, l->inner_vector_size, l->schwarz_vector_size, l ); - END_MASTER(threading) + vector_PRECISION_define_zero( x, 0, l->inner_vector_size, l, threading ); + vector_PRECISION_define_zero( x, l->inner_vector_size, l->schwarz_vector_size, l, threading ); SYNC_CORES(threading) } else { vector_PRECISION_copy( x, phi, start, end, l ); @@ -1350,7 +1680,7 @@ void red_black_schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_phi, // copy phi = x if ( l->relax_fac != 1.0 ) - vector_PRECISION_scale( phi, x, l->relax_fac, start, end, l ); + vector_PRECISION_scale( phi, x, l->relax_fac, start, end, l, no_threading ); else vector_PRECISION_copy( phi, x, start, end, l ); @@ -1363,7 +1693,7 @@ void red_black_schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_phi, s->block[index].start*l->num_lattice_site_var+s->block_vector_size, l ); if ( l->relax_fac != 1.0 ) { vector_PRECISION_scale( D_phi, D_phi, l->relax_fac, 
s->block[index].start*l->num_lattice_site_var, - s->block[index].start*l->num_lattice_site_var+s->block_vector_size, l ); + s->block[index].start*l->num_lattice_site_var+s->block_vector_size, l, no_threading ); } } @@ -1380,7 +1710,7 @@ void red_black_schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_phi, s->block[index].start*l->num_lattice_site_var+s->block_vector_size, l ); if ( l->relax_fac != 1.0 ) { vector_PRECISION_scale( D_phi, D_phi, l->relax_fac, s->block[index].start*l->num_lattice_site_var, - s->block[index].start*l->num_lattice_site_var+s->block_vector_size, l ); + s->block[index].start*l->num_lattice_site_var+s->block_vector_size, l, no_threading ); } START_MASTER(threading) PROF_PRECISION_STOP( _SM3, 1 ); @@ -1388,8 +1718,9 @@ } if ( step == 0 || step == 1 ) { START_LOCKED_MASTER(threading) - for ( mu=0; mu<4; mu++ ) + for ( mu=0; mu<4; mu++ ) { communicate[0]( latest_iter, mu, commdir[step], &(s->op.c), l ); + } END_LOCKED_MASTER(threading) } else { SYNC_CORES(threading) @@ -1399,6 +1730,7 @@ SYNC_CORES(threading) #ifdef SCHWARZ_RES + int nb = s->num_blocks; START_LOCKED_MASTER(threading) if ( D_phi == NULL ) { for ( mu=0; mu<4; mu++ ) { @@ -1406,26 +1738,23 @@ ghost_update_PRECISION( latest_iter, mu, -1, &(s->op.c), l ); } - for ( i=0; i<nb; i++ ) { - if ( !s->block[i].no_comm ) { + for ( i=0; i<nb; i++ ) + if ( !s->block[i].no_comm ) n_boundary_op( r, latest_iter, i, s, l ); - } - } for ( mu=0; mu<4; mu++ ) { ghost_update_wait_PRECISION( latest_iter, mu, +1, &(s->op.c), l ); ghost_update_wait_PRECISION( latest_iter, mu, -1, &(s->op.c), l ); } - for ( i=0; i<nb; i++ ) { - if ( !s->block[i].no_comm ) { + for ( i=0; i<nb; i++ ) + if ( !s->block[i].no_comm ) n_boundary_op( r, latest_iter, i, s, l ); - } - } + } - double rnorm = global_norm_PRECISION( r, 0, l->inner_vector_size, l, no_threading ); + PRECISION r_norm = global_norm_PRECISION( r, 0, l->inner_vector_size, l, no_threading ); char number[3]; sprintf( number, "%2d", 31+l->depth ); printf0("\033[1;%2sm|", number ); - printf0(" ---- depth: %d, c: %d, schwarz iter %2d, norm: %11.6le |", l->depth, s->num_colors, k, rnorm ); + printf0(" ---- depth: %d, c: %d, schwarz iter %2d, norm: %11.6le |", l->depth, s->num_colors, k, r_norm ); printf0("\033[0m\n"); fflush(0); END_LOCKED_MASTER(threading) #endif @@ -1443,8 +1772,8 @@ void schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_phi, vector_PRE vector_PRECISION Dphi = s->buf4; vector_PRECISION latest_iter = s->buf2; vector_PRECISION x = s->buf3; - void (*block_op)() = (l->depth==0)?block_d_plus_clover_PRECISION:coarse_block_operator_PRECISION, - (*boundary_op)() = (l->depth==0)?block_PRECISION_boundary_op:coarse_block_PRECISION_boundary_op, + void (*block_op)() = (l->depth==0)?block_d_plus_clover_PRECISION:coarse_block_operator_PRECISION; + void (*boundary_op)() = (l->depth==0)?block_PRECISION_boundary_op:coarse_block_PRECISION_boundary_op, (*n_boundary_op)() = (l->depth==0)?n_block_PRECISION_boundary_op:n_coarse_block_PRECISION_boundary_op, (*block_solve)() = (l->depth==0&&g.odd_even)?block_solve_oddeven_PRECISION:local_minres_PRECISION; @@ -1456,17 +1785,12 @@ void schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_phi, vector_PRE if ( res == _NO_RES ) { vector_PRECISION_copy( r, eta, nb_thread_start*s->block_vector_size, nb_thread_end*s->block_vector_size, l ); - vector_PRECISION_define( x, 0, 
nb_thread_start*s->block_vector_size, nb_thread_end*s->block_vector_size, l ); + vector_PRECISION_define_zero( x, 0, nb*s->block_vector_size, l, threading ); + vector_PRECISION_define_zero( x, l->inner_vector_size, l->schwarz_vector_size, l, threading ); } else { vector_PRECISION_copy( x, phi, nb_thread_start*s->block_vector_size, nb_thread_end*s->block_vector_size, l ); } - START_MASTER(threading) - if ( res == _NO_RES ) { - vector_PRECISION_define( x, 0, l->inner_vector_size, l->schwarz_vector_size, l ); - } - END_MASTER(threading) - SYNC_CORES(threading) for ( k=0; krelax_fac != 1.0 ) - vector_PRECISION_scale( phi, x, l->relax_fac, s->block[i].start*l->num_lattice_site_var, s->block[i].start*l->num_lattice_site_var+s->block_vector_size, l ); + vector_PRECISION_scale( phi, x, l->relax_fac, s->block[i].start*l->num_lattice_site_var, s->block[i].start*l->num_lattice_site_var+s->block_vector_size, l, no_threading ); else vector_PRECISION_copy( phi, x, s->block[i].start*l->num_lattice_site_var, s->block[i].start*l->num_lattice_site_var+s->block_vector_size, l ); } @@ -1581,7 +1905,7 @@ void schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_phi, vector_PRE s->block[i].start*l->num_lattice_site_var+s->block_vector_size, l ); if ( l->relax_fac != 1.0 ) { vector_PRECISION_scale( D_phi, D_phi, l->relax_fac, s->block[i].start*l->num_lattice_site_var, - s->block[i].start*l->num_lattice_site_var+s->block_vector_size, l ); + s->block[i].start*l->num_lattice_site_var+s->block_vector_size, l, no_threading ); } } if ( 1 == s->block[i].color ) { @@ -1589,7 +1913,7 @@ void schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_phi, vector_PRE s->block[i].start*l->num_lattice_site_var+s->block_vector_size, l ); if ( l->relax_fac != 1.0 ) { vector_PRECISION_scale( D_phi, D_phi, l->relax_fac, s->block[i].start*l->num_lattice_site_var, - s->block[i].start*l->num_lattice_site_var+s->block_vector_size, l ); + s->block[i].start*l->num_lattice_site_var+s->block_vector_size, l, no_threading ); } } } @@ -1608,7 +1932,7 @@ void schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_phi, vector_PRE s->block[i].start*l->num_lattice_site_var+s->block_vector_size, l ); if ( l->relax_fac != 1.0 ) { vector_PRECISION_scale( D_phi, D_phi, l->relax_fac, s->block[i].start*l->num_lattice_site_var, - s->block[i].start*l->num_lattice_site_var+s->block_vector_size, l ); + s->block[i].start*l->num_lattice_site_var+s->block_vector_size, l, no_threading ); } } } @@ -1623,26 +1947,24 @@ void schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_phi, vector_PRE ghost_update_PRECISION( latest_iter, mu, -1, &(s->op.c), l ); } - for ( i=0; iblock[i].no_comm ) { + for ( i=0; iblock[i].no_comm ) n_boundary_op( r, latest_iter, i, s, l ); - } - } + for ( mu=0; mu<4; mu++ ) { ghost_update_wait_PRECISION( latest_iter, mu, +1, &(s->op.c), l ); ghost_update_wait_PRECISION( latest_iter, mu, -1, &(s->op.c), l ); } - for ( i=0; iblock[i].no_comm ) { + for ( i=0; iblock[i].no_comm ) n_boundary_op( r, latest_iter, i, s, l ); - } - } } - double rnorm = global_norm_PRECISION( r, 0, l->inner_vector_size, l, no_threading ); + + PRECISION r_norm = global_norm_PRECISION( r, 0, l->inner_vector_size, l, no_threading ); char number[3]; sprintf( number, "%2d", 31+l->depth ); printf0("\033[1;%2sm|", number ); - printf0(" ---- depth: %d, c: %d, schwarz iter %2d, norm: %11.6le |", l->depth, s->num_colors, k, rnorm ); + printf0(" ---- depth: %d, c: %d, schwarz iter %2d, norm: %11.6le |", l->depth, s->num_colors, k, r_norm ); 
printf0("\033[0m\n"); fflush(0); END_LOCKED_MASTER(threading) #endif @@ -1661,10 +1983,10 @@ void sixteen_color_schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_p else { int color, k, mu, i, nb = s->num_blocks; vector_PRECISION r = s->buf1, Dphi = s->buf4, latest_iter = s->buf2, x = s->buf3; - void (*block_op)() = (l->depth==0)?block_d_plus_clover_PRECISION:coarse_block_operator_PRECISION, - (*boundary_op)() = (l->depth==0)?block_PRECISION_boundary_op:coarse_block_PRECISION_boundary_op, - (*n_boundary_op)() = (l->depth==0)?n_block_PRECISION_boundary_op:n_coarse_block_PRECISION_boundary_op, - (*block_solve)() = (l->depth==0&&g.odd_even)?block_solve_oddeven_PRECISION:local_minres_PRECISION; + void (*block_op)() = (l->depth==0)?block_d_plus_clover_PRECISION:coarse_block_operator_PRECISION; + void (*boundary_op)() = (l->depth==0)?block_PRECISION_boundary_op:coarse_block_PRECISION_boundary_op, + (*n_boundary_op)() = (l->depth==0)?n_block_PRECISION_boundary_op:n_coarse_block_PRECISION_boundary_op, + (*block_solve)() = (l->depth==0&&g.odd_even)?block_solve_oddeven_PRECISION:local_minres_PRECISION; int color_to_comm[16][2] = { {T,-1}, {X,+1}, {Y,+1}, {X,-1}, {Z,+1}, {Y,-1}, {X,+1}, {Y,+1}, {T,+1}, {X,-1}, {Y,-1}, {X,+1}, {Z,-1}, {Y,+1}, {X,-1}, {Y,-1} }; @@ -1677,16 +1999,12 @@ void sixteen_color_schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_p if ( res == _NO_RES ) { vector_PRECISION_copy( r, eta, nb_thread_start*s->block_vector_size, nb_thread_end*s->block_vector_size, l ); - vector_PRECISION_define( x, 0, nb_thread_start*s->block_vector_size, nb_thread_end*s->block_vector_size, l ); + vector_PRECISION_define_zero( x, 0, nb*s->block_vector_size, l, threading ); + vector_PRECISION_define_zero( x, l->inner_vector_size, l->schwarz_vector_size, l, threading ); } else { vector_PRECISION_copy( x, phi, nb_thread_start*s->block_vector_size, nb_thread_end*s->block_vector_size, l ); } - START_MASTER(threading) - if ( res == _NO_RES ) { - vector_PRECISION_define( x, 0, l->inner_vector_size, l->schwarz_vector_size, l ); - } - END_MASTER(threading) SYNC_CORES(threading) @@ -1764,7 +2082,7 @@ void sixteen_color_schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_p SYNC_CORES(threading) if ( l->relax_fac != 1.0 ) - vector_PRECISION_scale( phi, x, l->relax_fac, nb_thread_start*s->block_vector_size, nb_thread_end*s->block_vector_size, l ); + vector_PRECISION_scale( phi, x, l->relax_fac, nb_thread_start*s->block_vector_size, nb_thread_end*s->block_vector_size, l, no_threading ); else vector_PRECISION_copy( phi, x, nb_thread_start*s->block_vector_size, nb_thread_end*s->block_vector_size, l ); @@ -1773,8 +2091,11 @@ void sixteen_color_schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_p #ifdef SCHWARZ_RES START_LOCKED_MASTER(threading) vector_PRECISION true_r = NULL; + PUBLIC_MALLOC( true_r, complex_PRECISION, l->vector_size ); - vector_PRECISION_define( true_r, 0, 0, l->inner_vector_size, l ); + vector_PRECISION_define_zero( true_r, 0, l->inner_vector_size, l, no_threading ); + + if ( D_phi == NULL ) { for ( mu=0; mu<4; mu++ ) { ghost_update_PRECISION( x, mu, +1, &(s->op.c), l ); @@ -1791,11 +2112,12 @@ void sixteen_color_schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_p boundary_op( true_r, x, i, s, l ); } } - vector_PRECISION_saxpy( true_r, eta, true_r, -1, 0, l->inner_vector_size, l ); - double rnorm = global_norm_PRECISION( true_r, 0, l->inner_vector_size, l, no_threading ) - / global_norm_PRECISION( eta, 0, l->inner_vector_size, l, no_threading ); + 
vector_PRECISION_saxpy( true_r, eta, true_r, -1, 0, l->inner_vector_size, l, no_threading ); + PRECISION r_norm = global_norm_PRECISION( true_r, 0, l->inner_vector_size, l, no_threading ), + den = global_norm_PRECISION( eta, 0, l->inner_vector_size, l, no_threading ); + r_norm/=den; char number[3]; sprintf( number, "%2d", 31+l->depth ); printf0("\033[1;%2sm|", number ); - printf0(" ---- depth: %d, c: %d, schwarz iter %2d, norm: %11.6le |", l->depth, s->num_colors, k, rnorm ); + printf0(" ---- depth: %d, c: %d, schwarz iter %2d, norm: %11.6le |", l->depth, s->num_colors, k, r_norm ); printf0("\033[0m\n"); fflush(0); PUBLIC_FREE( true_r, complex_PRECISION, l->vector_size ); END_LOCKED_MASTER(threading) @@ -1816,6 +2138,16 @@ void trans_PRECISION( vector_PRECISION out, vector_double in, int *tt, level_str // this function seems to do some data reordering, barriers ensure that everything is in sync SYNC_CORES(threading) START_NO_HYPERTHREADS(threading) +#ifdef HAVE_TM1p1 + if( g.n_flavours == 2 ) + for ( i=start; inum_blocks; + int svs = l->schwarz_vector_size; + int ivs = l->inner_vector_size; + int vs = l->vector_size; + void (*block_op)() = (l->depth==0)?block_d_plus_clover_PRECISION:coarse_block_operator_PRECISION; - void (*boundary_op)() = (l->depth==0)?block_PRECISION_boundary_op:coarse_block_PRECISION_boundary_op; void (*op)() = (l->depth==0)?d_plus_clover_PRECISION:apply_coarse_operator_PRECISION; + void (*boundary_op)() = (l->depth==0)?block_PRECISION_boundary_op:coarse_block_PRECISION_boundary_op; vector_PRECISION v1 = NULL, v2 = NULL, v3 = NULL; - PRECISION norm; - - MALLOC( v1, complex_PRECISION, l->schwarz_vector_size ); - MALLOC( v2, complex_PRECISION, l->vector_size ); - MALLOC( v3, complex_PRECISION, l->vector_size ); - - vector_PRECISION_define_random( v1, 0, l->inner_vector_size, l ); + PRECISION diff; + MALLOC( v1, complex_PRECISION, svs ); + MALLOC( v2, complex_PRECISION, vs ); + MALLOC( v3, complex_PRECISION, vs ); + + vector_PRECISION_define_random( v1, 0, ivs, l, no_threading ); + op( v3, v1, &(s->op), l, no_threading ); - + for ( mu=0; mu<4; mu++ ) { ghost_update_PRECISION( v1, mu, +1, &(s->op.c), l ); ghost_update_PRECISION( v1, mu, -1, &(s->op.c), l ); } - + for ( mu=0; mu<4; mu++ ) { ghost_update_wait_PRECISION( v1, mu, +1, &(s->op.c), l ); ghost_update_wait_PRECISION( v1, mu, -1, &(s->op.c), l ); } - + for ( i=0; iblock[i].start*l->num_lattice_site_var, s, l, no_threading ); boundary_op( v2, v1, i, s, l, no_threading ); } - + vector_PRECISION_minus( v3, v3, v2, 0, l->inner_vector_size, l ); - norm = global_norm_PRECISION( v3, 0, l->inner_vector_size, l, no_threading )/global_norm_PRECISION( v2, 0, l->inner_vector_size, l, no_threading ); + diff = global_norm_PRECISION( v3, 0, l->inner_vector_size, l, no_threading ) / + global_norm_PRECISION( v2, 0, l->inner_vector_size, l, no_threading ); - printf0("depth: %d, correctness of local residual vector: %le\n", l->depth, norm ); - if(norm > g.test) g.test = norm; - + test0_PRECISION("depth: %d, correctness of local residual vector: %le\n", l->depth, diff ); + FREE( v1, complex_PRECISION, l->schwarz_vector_size ); FREE( v2, complex_PRECISION, l->vector_size ); FREE( v3, complex_PRECISION, l->vector_size ); diff --git a/src/schwarz_generic.h b/src/schwarz_generic.h index 2834fc5..fab1613 100644 --- a/src/schwarz_generic.h +++ b/src/schwarz_generic.h @@ -73,5 +73,23 @@ struct Thread; return site_index( coord[T], coord[Z], coord[Y], coord[X], dt, it ); } } - + +#ifdef OPTIMIZED_NEIGHBOR_COUPLING_float +static inline void 
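/*
 * The SCHWARZ_RES block above reports the relative residual of the current
 * Schwarz iterate: true_r accumulates D*x, the saxpy with factor -1 turns it
 * into eta - D*x, and the two global norms give ||eta - D*x|| / ||eta||.
 * A scalar illustration of the same quantity (hypothetical helper, not part
 * of the patch):
 */
#include <complex.h>
#include <math.h>

static double relative_residual_example( const double complex *eta, const double complex *Dx, int n ) {
  double num = 0.0, den = 0.0;
  for ( int i = 0; i < n; i++ ) {
    double complex r = eta[i] - Dx[i];                               // residual component
    num += creal( r )*creal( r ) + cimag( r )*cimag( r );           // ||eta - D*x||^2
    den += creal( eta[i] )*creal( eta[i] ) + cimag( eta[i] )*cimag( eta[i] );  // ||eta||^2
  }
  return sqrt( num / den );
}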
set_PRECISION_D_vectorized( PRECISION *out1, PRECISION *out2, complex_PRECISION *in ) { + // out1: column major, out2: row major + for ( int i=0; i<3; i++ ) { // column + for ( int j=0; j<3; j++ ) { // row + out1[8*i +j] = creal_PRECISION(in[3*j+i]); + out1[8*i+4+j] = cimag_PRECISION(in[3*j+i]); + out2[8*i +j] = creal_PRECISION(in[j+3*i]); + out2[8*i+4+j] = cimag_PRECISION(in[j+3*i]); + } + out1[8*i+3] = 0.0; + out1[8*i+7] = 0.0; + out2[8*i+3] = 0.0; + out2[8*i+7] = 0.0; + } +} +#endif + #endif diff --git a/src/setup_generic.c b/src/setup_generic.c index 6280f98..61013da 100644 --- a/src/setup_generic.c +++ b/src/setup_generic.c @@ -32,7 +32,7 @@ void coarse_grid_correction_PRECISION_setup( level_struct *l, struct Thread *thr START_LOCKED_MASTER(threading) coarse_operator_PRECISION_alloc( l ); -#ifndef INTERPOLATION_SETUP_LAYOUT_OPTIMIZED_PRECISION +#ifndef OPTIMIZED_INTERPOLATION_SETUP_PRECISION coarse_operator_PRECISION_setup( l->is_PRECISION.interpolation, l ); END_LOCKED_MASTER(threading) #else @@ -53,16 +53,6 @@ void coarse_grid_correction_PRECISION_setup( level_struct *l, struct Thread *thr } else { interpolation_PRECISION_dummy_alloc( l->next_level ); } - -#ifdef HAVE_TM - l->next_level->tm_shift = g.tm_mu*g.tm_mu_factor[l->next_level->depth]; - l->next_level->tm_even_shift = g.tm_mu_even_shift*g.tm_mu_factor[l->next_level->depth]; - l->next_level->tm_odd_shift = g.tm_mu_odd_shift*g.tm_mu_factor[l->next_level->depth]; - - if( g.tm_mu_factor[l->next_level->depth]!=g.tm_mu_factor[l->depth] ) - tm_term_PRECISION_setup( l->next_level->op_PRECISION.tm_term, l->next_level->op_PRECISION.odd_proj, l->next_level, no_threading ); -#endif - conf_PRECISION_gather( &(l->next_level->s_PRECISION.op), &(l->next_level->op_PRECISION), l->next_level ); END_LOCKED_MASTER(threading) @@ -73,8 +63,9 @@ void coarse_grid_correction_PRECISION_setup( level_struct *l, struct Thread *thr END_LOCKED_MASTER(threading) if ( g.method >= 4 && g.odd_even ) { START_LOCKED_MASTER(threading) - coarse_oddeven_setup_PRECISION( &(l->next_level->s_PRECISION.op), _REORDER, l->next_level ); + coarse_oddeven_alloc_PRECISION( l->next_level ); END_LOCKED_MASTER(threading) + coarse_oddeven_setup_PRECISION( &(l->next_level->s_PRECISION.op), _REORDER, l->next_level, threading ); } coarse_operator_PRECISION_set_couplings( &(l->next_level->s_PRECISION.op), l->next_level, threading ); START_LOCKED_MASTER(threading) @@ -83,8 +74,9 @@ void coarse_grid_correction_PRECISION_setup( level_struct *l, struct Thread *thr } if ( !l->next_level->idle && l->next_level->level == 0 && g.odd_even ) { START_LOCKED_MASTER(threading) - coarse_oddeven_setup_PRECISION( &(l->next_level->s_PRECISION.op), _NO_REORDERING, l->next_level ); + coarse_oddeven_alloc_PRECISION( l->next_level ); END_LOCKED_MASTER(threading) + coarse_oddeven_setup_PRECISION( &(l->next_level->s_PRECISION.op), _NO_REORDERING, l->next_level, threading ); } else if ( !l->next_level->idle && l->next_level->level == 0 ) { coarse_operator_PRECISION_set_couplings( &(l->next_level->s_PRECISION.op), l->next_level, threading ); } @@ -101,13 +93,12 @@ void coarse_grid_correction_PRECISION_setup( level_struct *l, struct Thread *thr for ( int i=0; inext_level->num_eig_vect,l->num_eig_vect); i++ ) { restrict_PRECISION( l->next_level->is_PRECISION.test_vector[i], l->is_PRECISION.test_vector[i], l, threading ); } - START_LOCKED_MASTER(threading) for ( int i=MIN(l->next_level->num_eig_vect,l->num_eig_vect); inext_level->num_eig_vect; i++ ) { if ( !l->next_level->idle ) vector_PRECISION_define_random( 
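/*
 * Reference for the layout produced by set_PRECISION_D_vectorized above:
 * block i of out1 holds column i of the 3x3 link matrix as 4 padded reals
 * followed by 4 padded imaginaries (the 4th entry of each group of 4 is the
 * zero padding). The helper below is only a scalar illustration of that
 * layout; packed_colmajor_mvm_float is a hypothetical name, not part of the
 * library or of this patch.
 */
static inline void packed_colmajor_mvm_float( float *y_re, float *y_im, const float *out1,
                                              const float *x_re, const float *x_im ) {
  for ( int j=0; j<3; j++ ) { y_re[j] = 0.0f; y_im[j] = 0.0f; }
  for ( int i=0; i<3; i++ )      // column index, as in the packing loop above
    for ( int j=0; j<3; j++ ) {  // row index
      float m_re = out1[8*i  +j];   // Re of element (row j, column i)
      float m_im = out1[8*i+4+j];   // Im of element (row j, column i)
      y_re[j] += m_re*x_re[i] - m_im*x_im[i];  // complex multiply-accumulate y += M*x
      y_im[j] += m_re*x_im[i] + m_im*x_re[i];
    }
}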
l->next_level->is_PRECISION.test_vector[i], 0, - l->next_level->inner_vector_size, l->next_level ); + l->next_level->inner_vector_size, l->next_level, threading ); } - END_LOCKED_MASTER(threading) + SYNC_CORES(threading); } if ( !l->next_level->idle ) interpolation_PRECISION_define( NULL, l->next_level, threading ); @@ -226,19 +217,14 @@ void interpolation_PRECISION_define( vector_double *V, level_struct *l, struct T for ( k=0; kdepth == 0 ) { - START_LOCKED_MASTER(threading) - vector_PRECISION_define_random( l->is_PRECISION.test_vector[k], 0, l->inner_vector_size, l ); - END_LOCKED_MASTER(threading) + vector_PRECISION_define_random( l->is_PRECISION.test_vector[k], 0, l->inner_vector_size, l, threading ); } - - smoother_PRECISION( buffer[0], NULL, l->is_PRECISION.test_vector[k], - 1, _NO_RES, _NO_SHIFT, l, threading ); + SYNC_CORES(threading) + smoother_PRECISION( buffer[0], NULL, l->is_PRECISION.test_vector[k], 1, _NO_RES, l, threading ); vector_PRECISION_copy( l->is_PRECISION.test_vector[k], buffer[0], start, end, l ); - smoother_PRECISION( buffer[0], NULL, l->is_PRECISION.test_vector[k], - g.method>=4?1:2, _NO_RES, _NO_SHIFT, l, threading ); + smoother_PRECISION( buffer[0], NULL, l->is_PRECISION.test_vector[k], g.method>=4?1:2, _NO_RES, l, threading ); vector_PRECISION_copy( l->is_PRECISION.test_vector[k], buffer[0], start, end, l ); - smoother_PRECISION( buffer[0], NULL, l->is_PRECISION.test_vector[k], - g.method>=4?1:3, _NO_RES, _NO_SHIFT, l, threading ); + smoother_PRECISION( buffer[0], NULL, l->is_PRECISION.test_vector[k], g.method>=4?1:3, _NO_RES, l, threading ); vector_PRECISION_copy( l->is_PRECISION.test_vector[k], buffer[0], start, end, l ); pc += 6; @@ -270,16 +256,16 @@ void interpolation_PRECISION_define( vector_double *V, level_struct *l, struct T } } -#ifndef INTERPOLATION_SETUP_LAYOUT_OPTIMIZED_PRECISION +#ifndef OPTIMIZED_INTERPOLATION_SETUP_PRECISION for ( k=0; kis_PRECISION.interpolation[k], l->is_PRECISION.test_vector[k], start, end, l ); #endif testvector_analysis_PRECISION( l->is_PRECISION.test_vector, l, threading ); -#ifdef INTERPOLATION_SETUP_LAYOUT_OPTIMIZED_PRECISION +#ifdef OPTIMIZED_INTERPOLATION_SETUP_PRECISION define_interpolation_PRECISION_operator( l->is_PRECISION.test_vector, l, threading ); - gram_schmidt_on_aggregates_PRECISION_vectorized( l->is_PRECISION.operator, n, l, threading ); + gram_schmidt_on_aggregates_PRECISION( l->is_PRECISION.operator, n, l, threading ); #else gram_schmidt_on_aggregates_PRECISION( l->is_PRECISION.interpolation, n, l, threading ); define_interpolation_PRECISION_operator( l->is_PRECISION.interpolation, l, threading ); @@ -292,17 +278,17 @@ void re_setup_PRECISION( level_struct *l, struct Thread *threading ) { if ( l->level > 0 ) { if ( !l->idle ) { -#ifdef INTERPOLATION_SETUP_LAYOUT_OPTIMIZED_PRECISION +#ifdef OPTIMIZED_INTERPOLATION_SETUP_PRECISION define_interpolation_PRECISION_operator( l->is_PRECISION.test_vector, l, threading ); - gram_schmidt_on_aggregates_PRECISION_vectorized( l->is_PRECISION.operator, l->num_eig_vect, l, threading ); + gram_schmidt_on_aggregates_PRECISION( l->is_PRECISION.operator, l->num_eig_vect, l, threading ); if ( l->depth > 0 ) - gram_schmidt_on_aggregates_PRECISION_vectorized( l->is_PRECISION.operator, l->num_eig_vect, l, threading ); + gram_schmidt_on_aggregates_PRECISION( l->is_PRECISION.operator, l->num_eig_vect, l, threading ); coarse_operator_PRECISION_setup_vectorized( l->is_PRECISION.operator, l, threading ); START_LOCKED_MASTER(threading) #else for ( int i=0; inum_eig_vect; i++ ) { 
vector_PRECISION_copy( l->is_PRECISION.interpolation[i], l->is_PRECISION.test_vector[i], - threading->start_index[l->depth], threading->end_index[l->depth], l ); + threading->start_index[l->depth], threading->end_index[l->depth], l ); } gram_schmidt_on_aggregates_PRECISION( l->is_PRECISION.interpolation, l->num_eig_vect, l, threading ); if ( l->depth > 0 ) @@ -310,15 +296,6 @@ void re_setup_PRECISION( level_struct *l, struct Thread *threading ) { define_interpolation_PRECISION_operator( l->is_PRECISION.interpolation, l, threading ); START_LOCKED_MASTER(threading) coarse_operator_PRECISION_setup( l->is_PRECISION.interpolation, l ); -#endif -#ifdef HAVE_TM - l->next_level->tm_shift = g.tm_mu*g.tm_mu_factor[l->next_level->depth]; - l->next_level->tm_even_shift = g.tm_mu_even_shift*g.tm_mu_factor[l->next_level->depth]; - l->next_level->tm_odd_shift = g.tm_mu_odd_shift*g.tm_mu_factor[l->next_level->depth]; - - if( g.tm_mu_factor[l->next_level->depth]!=g.tm_mu_factor[l->depth] ) - tm_term_PRECISION_setup( l->next_level->op_PRECISION.tm_term, l->next_level->op_PRECISION.odd_proj, - l->next_level, no_threading ); #endif conf_PRECISION_gather( &(l->next_level->s_PRECISION.op), &(l->next_level->op_PRECISION), l->next_level ); END_LOCKED_MASTER(threading) @@ -327,13 +304,13 @@ void re_setup_PRECISION( level_struct *l, struct Thread *threading ) { schwarz_PRECISION_boundary_update( &(l->next_level->s_PRECISION), l->next_level ); END_LOCKED_MASTER(threading) if ( g.method >= 4 && g.odd_even ) { - coarse_oddeven_re_setup_PRECISION( &(l->next_level->s_PRECISION.op), _REORDER, l->next_level, threading ); + coarse_oddeven_setup_PRECISION( &(l->next_level->s_PRECISION.op), _REORDER, l->next_level, threading ); } else { coarse_operator_PRECISION_set_couplings( &(l->next_level->s_PRECISION.op), l->next_level, threading ); } } if ( !l->next_level->idle && l->next_level->level == 0 && g.odd_even ) { - coarse_oddeven_re_setup_PRECISION( &(l->next_level->s_PRECISION.op), _NO_REORDERING, l->next_level, threading ); + coarse_oddeven_setup_PRECISION( &(l->next_level->s_PRECISION.op), _NO_REORDERING, l->next_level, threading ); } else if ( !l->next_level->idle && l->next_level->level == 0 ) { coarse_operator_PRECISION_set_couplings( &(l->next_level->s_PRECISION.op), l->next_level, threading ); } @@ -379,14 +356,15 @@ void inv_iter_2lvl_extension_setup_PRECISION( int setup_iter, level_struct *l, s fgmres_PRECISION( &gmres, l->next_level, threading ); } } - interpolate3_PRECISION( buf1, gmres.x, l, threading ); - smoother_PRECISION( buf1, NULL, l->is_PRECISION.test_vector[i], l->post_smooth_iter, _RES, _NO_SHIFT, l, threading ); + vector_PRECISION_define_zero( buf1, 0, l->inner_vector_size, l, threading ); + interpolate_PRECISION( buf1, gmres.x, l, threading ); + smoother_PRECISION( buf1, NULL, l->is_PRECISION.test_vector[i], l->post_smooth_iter, _RES, l, threading ); vector_PRECISION_real_scale( l->is_PRECISION.test_vector[i], buf1, 1.0/global_norm_PRECISION( buf1, 0, l->inner_vector_size, l, threading ), threading->start_index[l->depth], threading->end_index[l->depth], l ); pc += l->post_smooth_iter; #ifdef DEBUG - START_MASTER(threading) + START_MASTER(threading) if ( pc >= 0.2*pi*pn ) { printf0("%4d%% |", 20*pi); fflush(0); pi++; } END_MASTER(threading) #endif @@ -397,11 +375,11 @@ void inv_iter_2lvl_extension_setup_PRECISION( int setup_iter, level_struct *l, s END_MASTER(threading) #endif -#ifdef INTERPOLATION_SETUP_LAYOUT_OPTIMIZED_PRECISION +#ifdef OPTIMIZED_INTERPOLATION_SETUP_PRECISION 
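/*
 * The setup routines here repeatedly call gram_schmidt_on_aggregates_PRECISION:
 * within every aggregate, the test vectors are orthonormalized against each
 * other, so the interpolation built from them has orthonormal columns per
 * aggregate. A minimal scalar sketch of that idea (hypothetical helper,
 * assuming the n vectors are already restricted to one aggregate of length len):
 */
#include <complex.h>
#include <math.h>

static void gram_schmidt_one_aggregate( float complex **v, int n, int len ) {
  for ( int i=0; i<n; i++ ) {
    // project out the previously orthonormalized vectors
    for ( int j=0; j<i; j++ ) {
      float complex ip = 0;
      for ( int k=0; k<len; k++ ) ip += conjf( v[j][k] ) * v[i][k];
      for ( int k=0; k<len; k++ ) v[i][k] -= ip * v[j][k];
    }
    // normalize the remainder
    float nrm = 0.0f;
    for ( int k=0; k<len; k++ )
      nrm += crealf( v[i][k] )*crealf( v[i][k] ) + cimagf( v[i][k] )*cimagf( v[i][k] );
    nrm = sqrtf( nrm );
    for ( int k=0; k<len; k++ ) v[i][k] /= nrm;
  }
}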
define_interpolation_PRECISION_operator( l->is_PRECISION.test_vector, l, threading ); - gram_schmidt_on_aggregates_PRECISION_vectorized( l->is_PRECISION.operator, l->num_eig_vect, l, threading ); + gram_schmidt_on_aggregates_PRECISION( l->is_PRECISION.operator, l->num_eig_vect, l, threading ); if ( l->depth > 0 ) - gram_schmidt_on_aggregates_PRECISION_vectorized( l->is_PRECISION.operator, l->num_eig_vect, l, threading ); + gram_schmidt_on_aggregates_PRECISION( l->is_PRECISION.operator, l->num_eig_vect, l, threading ); coarse_operator_PRECISION_setup_vectorized( l->is_PRECISION.operator, l, threading ); START_LOCKED_MASTER(threading) #else @@ -422,13 +400,13 @@ void inv_iter_2lvl_extension_setup_PRECISION( int setup_iter, level_struct *l, s schwarz_PRECISION_boundary_update( &(l->next_level->s_PRECISION), l->next_level ); END_LOCKED_MASTER(threading) if ( g.method >= 4 && g.odd_even ) { - coarse_oddeven_re_setup_PRECISION( &(l->next_level->s_PRECISION.op), _REORDER, l->next_level, threading ); + coarse_oddeven_setup_PRECISION( &(l->next_level->s_PRECISION.op), _REORDER, l->next_level, threading ); } else { coarse_operator_PRECISION_set_couplings( &(l->next_level->s_PRECISION.op), l->next_level, threading ); } } if ( !l->next_level->idle && l->next_level->level == 0 && g.odd_even ) { - coarse_oddeven_re_setup_PRECISION( &(l->next_level->s_PRECISION.op), _NO_REORDERING, l->next_level, threading ); + coarse_oddeven_setup_PRECISION( &(l->next_level->s_PRECISION.op), _NO_REORDERING, l->next_level, threading ); } else if ( !l->next_level->idle && l->next_level->level == 0 ) { coarse_operator_PRECISION_set_couplings( &(l->next_level->s_PRECISION.op), l->next_level, threading ); } @@ -552,7 +530,7 @@ void testvector_analysis_PRECISION( vector_PRECISION *test_vectors, level_struct coarse_gamma5_PRECISION( l->vbuf_PRECISION[0], l->vbuf_PRECISION[3], 0, l->inner_vector_size, l ); lambda = global_inner_product_PRECISION( test_vectors[i], l->vbuf_PRECISION[0], 0, l->inner_vector_size, l, no_threading ); lambda /= global_inner_product_PRECISION( test_vectors[i], test_vectors[i], 0, l->inner_vector_size, l, no_threading ); - vector_PRECISION_saxpy( l->vbuf_PRECISION[1], l->vbuf_PRECISION[0], test_vectors[i], -lambda, 0, l->inner_vector_size, l ); + vector_PRECISION_saxpy( l->vbuf_PRECISION[1], l->vbuf_PRECISION[0], test_vectors[i], -lambda, 0, l->inner_vector_size, l, no_threading ); mu = global_norm_PRECISION( l->vbuf_PRECISION[1], 0, l->inner_vector_size, l, no_threading )/global_norm_PRECISION( test_vectors[i], 0, l->inner_vector_size, l, no_threading ); printf0("singular value: %+lf%+lfi, singular vector precision: %le\n", (double)creal(lambda), (double)cimag(lambda), (double)mu ); } diff --git a/src/simd_avx_intrinsic.h b/src/simd_avx_intrinsic.h new file mode 100644 index 0000000..f017e80 --- /dev/null +++ b/src/simd_avx_intrinsic.h @@ -0,0 +1,175 @@ +/* + * Copyright (C) 2016 Simone Bacchio. + * + * This file is part of the DDalphaAMG solver library. + * + * The DDalphaAMG solver library is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * The DDalphaAMG solver library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * + * You should have received a copy of the GNU General Public License + * along with the DDalphaAMG solver library. If not, see http://www.gnu.org/licenses/. + * + */ + +#ifndef SIMD_AVX_INTRINSIC_HEADER +#define SIMD_AVX_INTRINSIC_HEADER + +#include "immintrin.h" +#include "xmmintrin.h" +#include "emmintrin.h" +#include "pmmintrin.h" + +#define SIMD _AVX +#define SIMD_LENGTH_float 8 +#define SIMD_LENGTH_double 4 +#define mm_FOR_float(e) { e e e e e e e e } +#define mm_FOR_double(e) { e e e e } + +#define mm_float __m256 +#define mm_double __m256d + +#define mm_mul_float _mm256_mul_ps +#define mm_mul_double _mm256_mul_pd +#define mm_add_float _mm256_add_ps +#define mm_add_double _mm256_add_pd +#define mm_sub_float _mm256_sub_ps +#define mm_sub_double _mm256_sub_pd +#define mm_and_float _mm256_and_ps +#define mm_and_double _mm256_and_pd + +#define mm_setzero_float _mm256_setzero_ps +#define mm_setzero_double _mm256_setzero_pd +#define mm_setr_float _mm256_setr_ps +#define mm_setr_double _mm256_setr_pd +#define mm_set1_float _mm256_set1_ps +#define mm_set1_double _mm256_set1_pd +#define mm_load_float _mm256_load_ps +#define mm_load_double _mm256_load_pd +#define mm_unpacklo_float _mm256_unpacklo_ps +#define mm_unpacklo_double _mm256_unpacklo_pd +#define mm_unpackhi_float _mm256_unpackhi_ps +#define mm_unpackhi_double _mm256_unpackhi_pd +#define mm_store_float _mm256_store_ps +#define mm_store_double _mm256_store_pd + +#ifdef _FMA_ + +#define mm_fmadd_float _mm256_fmadd_ps +#define mm_fmadd_double _mm256_fmadd_pd +#define mm_fnmadd_float _mm256_fnmadd_ps +#define mm_fnmadd_double _mm256_fnmadd_pd +#define mm_fmsub_float _mm256_fmsub_ps +#define mm_fmsub_double _mm256_fmsub_pd +#define mm_fnmsub_float _mm256_fnmsub_ps +#define mm_fnmsub_double _mm256_fnmsub_pd + +#endif + +// Load even components +static inline mm_float mm_seti_float( float *data, const int i ) { + return mm_setr_float( data[0*i], data[1*i], data[2*i], data[3*i], data[4*i], data[5*i], data[6*i], data[7*i] ); +} +static inline mm_double mm_seti_double( double *data, const int i ) { + return mm_setr_double( data[0*i], data[1*i], data[2*i], data[3*i] ); +} + +// Loading 6 time the same component and then jumping 12 components +static inline void mm_set1_6times_float( float *data, mm_float *pack1of3, mm_float *pack2of3, + mm_float *pack3of3, const int skip ) { + *pack1of3 = mm_setr_float( data[0*i+0*skip], data[1*i+0*skip], data[2*i+0*skip], data[3*i+0*skip], + data[4*i+0*skip], data[5*i+0*skip], data[0*i+1*skip], data[1*i+1*skip] ); + *pack2of3 = mm_setr_float( data[2*i+1*skip], data[3*i+1*skip], data[4*i+1*skip], data[5*i+1*skip], + data[0*i+2*skip], data[1*i+2*skip], data[2*i+2*skip], data[3*i+2*skip] ); + *pack3of3 = mm_setr_float( data[4*i+2*skip], data[5*i+2*skip], data[0*i+3*skip], data[1*i+3*skip], + data[2*i+3*skip], data[3*i+3*skip], data[4*i+3*skip], data[5*i+3*skip] ); +} +static inline void mm_loadi_6times_double( double *data, mm_double *pack1of3, mm_double *pack2of3, + mm_double *pack3of3, const int i, const int skip ) { + *pack1of3 = mm_setr_double( data[0*i+0*skip], data[1*i+0*skip], data[2*i+0*skip], data[3*i+0*skip] ); + *pack2of3 = mm_setr_double( data[4*i+0*skip], data[5*i+0*skip], data[0*i+1*skip], data[1*i+1*skip] ); + *pack3of3 = mm_setr_double( data[2*i+1*skip], data[3*i+1*skip], data[4*i+1*skip], data[5*i+1*skip] ); +} + +static inline mm_float mm_set_from_list_float( float *data, float *alpha, int *list ) { + return mm_setr_float( alpha[0]*data[list[0]], alpha[1]*data[list[1]], 
alpha[2]*data[list[2]], alpha[3]*data[list[3]], + alpha[4]*data[list[4]], alpha[5]*data[list[5]], alpha[6]*data[list[6]], alpha[7]*data[list[7]] ); +} +static inline mm_double mm_set_from_list_double( double *data, double *alpha, int *list ) { + return mm_setr_double( alpha[0]*data[list[0]], alpha[1]*data[list[1]], alpha[2]*data[list[2]], alpha[3]*data[list[3]] ); +} + +// Sum all components of mm_PRECISION +static inline float mm_reduce_add_float( mm_float v) { + __m128 vlow = _mm256_castps256_ps128(v); + __m128 vhigh = _mm256_extractf128_ps(v, 1); // high 128 + vlow = _mm_add_ps(vlow, vhigh); // add the low 128 + // same of SSE + __m128 shuf = _mm_movehdup_ps(v); // broadcast elements 3,1 to 2,0 + __m128 sums = _mm_add_ps(v, shuf); + shuf = _mm_movehl_ps(shuf, sums); // high half -> low half + sums = _mm_add_ss(sums, shuf); + return _mm_cvtss_f32(sums); +} +static inline double mm_reduce_add_double( mm_double v ) { + __m128d vlow = _mm256_castpd256_pd128(v); + __m128d vhigh = _mm256_extractf128_pd(v, 1); + vlow = _mm_add_pd(vlow, vhigh); + // same of SSE + double tmp; + _mm_storeh_pd(&tmp, vlow); // store the high half + return _mm_cvtsd_f64(vlow) + tmp; // cast the low half and sum +} + +// Transpose a block of SIMD_LENGTH * SIMD_LENGTH +static inline void mm_transpose_float( mm_float *data ) { + mm_float __t0, __t1, __t2, __t3, __t4, __t5, __t6, __t7; + mm_float __tt0, __tt1, __tt2, __tt3, __tt4, __tt5, __tt6, __tt7; + __t0 = _mm256_unpacklo_ps(data[0], data[1]); + __t1 = _mm256_unpackhi_ps(data[0], data[1]); + __t2 = _mm256_unpacklo_ps(data[2], data[3]); + __t3 = _mm256_unpackhi_ps(data[2], data[3]); + __t4 = _mm256_unpacklo_ps(data[4], data[5]); + __t5 = _mm256_unpackhi_ps(data[4], data[5]); + __t6 = _mm256_unpacklo_ps(data[6], data[7]); + __t7 = _mm256_unpackhi_ps(data[6], data[7]); + __tt0 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(1,0,1,0)); + __tt1 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(3,2,3,2)); + __tt2 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(1,0,1,0)); + __tt3 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(3,2,3,2)); + __tt4 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(1,0,1,0)); + __tt5 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(3,2,3,2)); + __tt6 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(1,0,1,0)); + __tt7 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(3,2,3,2)); + data[0] = _mm256_permute2f128_ps(__tt0, __tt4, 0x20); + data[1] = _mm256_permute2f128_ps(__tt1, __tt5, 0x20); + data[2] = _mm256_permute2f128_ps(__tt2, __tt6, 0x20); + data[3] = _mm256_permute2f128_ps(__tt3, __tt7, 0x20); + data[4] = _mm256_permute2f128_ps(__tt0, __tt4, 0x31); + data[5] = _mm256_permute2f128_ps(__tt1, __tt5, 0x31); + data[6] = _mm256_permute2f128_ps(__tt2, __tt6, 0x31); + data[7] = _mm256_permute2f128_ps(__tt3, __tt7, 0x31); +} +static inline void mm_transpose_double( mm_double *data) +{ + mm_double tmp[4]; + + tmp[0] = _mm256_unpacklo_pd( data[0], data[1] ); + tmp[1] = _mm256_unpacklo_pd( data[2], data[3] ); + tmp[2] = _mm256_unpackhi_pd( data[0], data[1] ); + tmp[3] = _mm256_unpackhi_pd( data[2], data[3] ); + //TODO + data[0] = _mm256_movelh_pd( tmp[0], tmp[1] ); + data[1] = _mm256_movehl_pd( tmp[1], tmp[0] ); + data[2] = _mm256_movelh_pd( tmp[2], tmp[3] ); + data[3] = _mm256_movehl_pd( tmp[3], tmp[2] ); +} + +#endif diff --git a/src/simd_blas_generic.h b/src/simd_blas_generic.h new file mode 100644 index 0000000..d54b807 --- /dev/null +++ b/src/simd_blas_generic.h @@ -0,0 +1,354 @@ +/* + * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder. 
+ * + * This file is part of the DDalphaAMG solver library. + * + * The DDalphaAMG solver library is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * The DDalphaAMG solver library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * + * You should have received a copy of the GNU General Public License + * along with the DDalphaAMG solver library. If not, see http://www.gnu.org/licenses/. + * + */ + +#ifndef SIMD_BLAS_PRECISION_HEADER +#define SIMD_BLAS_PRECISION_HEADER + +static inline void cgem_inverse_PRECISION( const int N, PRECISION *A_inverse, PRECISION *A, int lda ) { + + // generate LU decomp in A + int i, j, k; + complex_PRECISION alpha; + + complex_PRECISION tmpA[N*N]; + complex_PRECISION tmpA_inverse[N*N]; + + for ( j=0; j0 ) + b[k-1] = 0; + + for ( i=0; i=0; i-- ) { + for ( j=i+1; j= j*offset; i -= SIMD_LENGTH_PRECISION ) { + ip = i%offset + 2*(i/offset)*padded; + cstore_PRECISION( C+2*ip, C1_re[i/SIMD_LENGTH_PRECISION], C1_im[i/SIMD_LENGTH_PRECISION] ); + cstore_PRECISION( C+2*(ip+padded), C2_re[i/SIMD_LENGTH_PRECISION], C2_im[i/SIMD_LENGTH_PRECISION] ); + } + } + } else { +#endif + mm_PRECISION A_re; + mm_PRECISION A_im; + mm_PRECISION B_re; + mm_PRECISION B_im; + mm_PRECISION C_re[lda/SIMD_LENGTH_PRECISION]; + mm_PRECISION C_im[lda/SIMD_LENGTH_PRECISION]; + + // deinterleaved load + for ( i=0; i= j*offset; i -= SIMD_LENGTH_PRECISION ) { + ip = i%offset + 2*(i/offset)*padded; + cstore_PRECISION( C+2*ip, C1_re[i/SIMD_LENGTH_PRECISION], C1_im[i/SIMD_LENGTH_PRECISION] ); + cstore_PRECISION( C+2*(ip+padded), C2_re[i/SIMD_LENGTH_PRECISION], C2_im[i/SIMD_LENGTH_PRECISION] ); + } + } + } else { +#endif + mm_PRECISION A_re; + mm_PRECISION A_im; + mm_PRECISION B_re; + mm_PRECISION B_im; + mm_PRECISION C_re[lda/SIMD_LENGTH_PRECISION]; + mm_PRECISION C_im[lda/SIMD_LENGTH_PRECISION]; + + // deinterleaved load + for ( i=0; i low half + sums = _mm_add_ss(sums, shuf); + return _mm_cvtss_f32(sums); +} +static inline double mm_reduce_add_double( mm_double v ) { + double tmp; + _mm_storeh_pd(&tmp, v); // store the high half + return _mm_cvtsd_f64(v) + tmp; // cast the low half and sum +} + +// Transpose a block of SIMD_LENGTH * SIMD_LENGTH +static inline void mm_transpose_float( mm_float *data ) { + _MM_TRANSPOSE4_PS(data[0],data[1],data[2],data[3]); +} +static inline void mm_transpose_double( mm_double *data ) { + double tmp01, tmp10 = _mm_cvtsd_f64(data[1]); + _mm_storeh_pd(&tmp01, data[0]); + _mm_loadl_pd(data[1], &tmp01); + _mm_loadh_pd(data[0], &tmp10); +} + +#endif diff --git a/src/simd_vectorization_control.h b/src/simd_vectorization_control.h new file mode 100644 index 0000000..0fbee29 --- /dev/null +++ b/src/simd_vectorization_control.h @@ -0,0 +1,88 @@ +/* + * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder. + * + * This file is part of the DDalphaAMG solver library. + * + * The DDalphaAMG solver library is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
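/*
 * cgem_inverse_PRECISION above follows the classic small-matrix scheme:
 * factor A = LU in place (no pivoting) and then solve L*y = e_k, U*x = y for
 * every unit vector e_k to obtain the columns of A^{-1}. A compact scalar
 * reference of the same scheme (hypothetical helper, complex float,
 * row-major storage; A is overwritten by its LU factors):
 */
#include <complex.h>

static void lu_inverse_cf( int N, float complex *A_inv, float complex *A ) {
  // in-place LU factorization, multipliers stored in the strict lower triangle
  for ( int k=0; k<N; k++ )
    for ( int i=k+1; i<N; i++ ) {
      A[i*N+k] /= A[k*N+k];
      for ( int j=k+1; j<N; j++ )
        A[i*N+j] -= A[i*N+k]*A[k*N+j];
    }
  for ( int k=0; k<N; k++ ) {           // column k of the inverse
    float complex x[N];
    for ( int i=0; i<N; i++ ) {         // forward solve L*x = e_k (L has unit diagonal)
      x[i] = ( i==k ) ? 1.0f : 0.0f;
      for ( int j=0; j<i; j++ ) x[i] -= A[i*N+j]*x[j];
    }
    for ( int i=N-1; i>=0; i-- ) {      // backward solve U*x = x
      for ( int j=i+1; j<N; j++ ) x[i] -= A[i*N+j]*x[j];
      x[i] /= A[i*N+i];
    }
    for ( int i=0; i<N; i++ ) A_inv[i*N+k] = x[i];
  }
}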
+ * + * The DDalphaAMG solver library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * + * You should have received a copy of the GNU General Public License + * along with the DDalphaAMG solver library. If not, see http://www.gnu.org/licenses/. + * + */ + +#ifndef SIMD_VECTORIZATION_CONTROL_HEADER +#define SIMD_VECTORIZATION_CONTROL_HEADER + +#ifdef NOT_YET//__AVX__ +#include "simd_avx_intrinsic.h" +#elif defined SSE //__SSE__ +#include "simd_sse_intrinsic.h" +#endif + +#ifdef SIMD + +#define OPTIMIZED_COARSE_NEIGHBOR_COUPLING_float +#define OPTIMIZED_COARSE_SELF_COUPLING_float +#define OPTIMIZED_INTERPOLATION_OPERATOR_float +#define OPTIMIZED_INTERPOLATION_SETUP_float +#define OPTIMIZED_NEIGHBOR_COUPLING_double +#define OPTIMIZED_NEIGHBOR_COUPLING_float +//#define OPTIMIZED_SELF_COUPLING_double +#define OPTIMIZED_SELF_COUPLING_float +#define OPTIMIZED_LINALG_float +#define OPTIMIZED_LINALG_double + +#define OPERATOR_COMPONENT_OFFSET_float (SIMD_LENGTH_float *((l->num_eig_vect+SIMD_LENGTH_float -1)/SIMD_LENGTH_float )) +#define OPERATOR_COMPONENT_OFFSET_double (SIMD_LENGTH_double*((l->num_eig_vect+SIMD_LENGTH_double-1)/SIMD_LENGTH_double)) + +#define OPERATOR_TYPE_float float +#define OPERATOR_TYPE_double double + +#endif + +#ifndef __FMA__ +// a*b + c +static inline mm_double mm_fmadd_double( mm_double a, mm_double b, mm_double c ) { + return mm_add_double( mm_mul_double( a, b ), c ); +} +static inline mm_float mm_fmadd_float( mm_float a, mm_float b, mm_float c ) { + return mm_add_float( mm_mul_float( a, b ), c ); +} + +// -a*b + c +static inline mm_double mm_fnmadd_double( mm_double a, mm_double b, mm_double c ) { + return mm_sub_double( c, mm_mul_double( a, b ) ); +} +static inline mm_float mm_fnmadd_float( mm_float a, mm_float b, mm_float c ) { + return mm_sub_float( c, mm_mul_float( a, b ) ); +} + +// a*b - c +static inline mm_double mm_fmsub_double( mm_double a, mm_double b, mm_double c ) { + return mm_sub_double( mm_mul_double( a, b ), c ); +} +static inline mm_float mm_fmsub_float( mm_float a, mm_float b, mm_float c ) { + return mm_sub_float( mm_mul_float( a, b ), c ); +} + +// res = -a*b - c +static inline mm_double mm_fnmsub_double( mm_double a, mm_double b, mm_double c ) { + mm_double na = mm_sub_double( mm_setzero_double(), a ); + return mm_sub_double( mm_mul_double( na, b ), c ); +} +static inline mm_float mm_fnmsub_float( mm_float a, mm_float b, mm_float c ) { + mm_float na = mm_sub_float( mm_setzero_float(), a ); + return mm_sub_float( mm_mul_float( na, b ), c ); +} +#endif + +#endif // SIMD_VECTORIZATION_CONTROL_HEADER diff --git a/src/solver_analysis.c b/src/solver_analysis.c index ab07aa8..325165e 100644 --- a/src/solver_analysis.c +++ b/src/solver_analysis.c @@ -24,8 +24,22 @@ void test_routine( level_struct *l, struct Thread *threading ) { - g.test = 0; - if ( g.method > 0 ) { + if ( g.method >= 0 ) { + START_MASTER(threading) + g.test = 0; + if ( l->depth == 0 ) { +#ifdef HAVE_TM1p1 + if( g.n_flavours==2 ) + printf0("\nRunning tests with D = TM doublet operator:\n"); + else +#endif +#ifdef HAVE_TM + printf0("\nRunning tests with D = TM Wilson operator:\n"); +#else + printf0("\nRunning tests with D = Wilson operator:\n"); +#endif + } + END_MASTER(threading) if ( g.mixed_precision ) { operator_float_test_routine( &(l->s_float.op), l, threading ); if ( g.method > 0 && g.method < 4 ) 
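/*
 * The mm_fmadd_* / mm_fnmadd_* wrappers above (native FMA when available,
 * mul+add fallback otherwise) exist so that complex multiply-accumulates can
 * be written on split real/imaginary registers:
 *   c += a*b       :  c_re += a_re*b_re - a_im*b_im,  c_im += a_re*b_im + a_im*b_re
 *   c += conj(a)*b :  c_re += a_re*b_re + a_im*b_im,  c_im += a_re*b_im - a_im*b_re
 * The sketches below use hypothetical names; the library's own cfmadd /
 * cfmadd_conj helpers used in the SSE coarse-operator code are assumed to
 * implement the same arithmetic, possibly with a different interface.
 */
static inline void cfmadd_sketch_float( mm_float a_re, mm_float a_im,
                                        mm_float b_re, mm_float b_im,
                                        mm_float *c_re, mm_float *c_im ) {
  *c_re = mm_fmadd_float ( a_re, b_re, *c_re );
  *c_re = mm_fnmadd_float( a_im, b_im, *c_re );   // c_re -= a_im*b_im
  *c_im = mm_fmadd_float ( a_re, b_im, *c_im );
  *c_im = mm_fmadd_float ( a_im, b_re, *c_im );
}
static inline void cfmadd_conj_sketch_float( mm_float a_re, mm_float a_im,
                                             mm_float b_re, mm_float b_im,
                                             mm_float *c_re, mm_float *c_im ) {
  *c_re = mm_fmadd_float ( a_re, b_re, *c_re );
  *c_re = mm_fmadd_float ( a_im, b_im, *c_re );   // + a_im*b_im (first operand conjugated)
  *c_im = mm_fmadd_float ( a_re, b_im, *c_im );
  *c_im = mm_fnmadd_float( a_im, b_re, *c_im );   // - a_im*b_re
}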
schwarz_float_mvm_testfun( &(l->s_float), l, threading ); @@ -36,25 +50,69 @@ void test_routine( level_struct *l, struct Thread *threading ) { if ( g.method > 0 && g.method < 4 && g.odd_even ) block_oddeven_double_test( l, threading ); } - if ( g.interpolation ) { + if ( g.interpolation && g.method > 0 ) { if ( g.mixed_precision ) coarse_operator_float_test_routine( l, threading ); else coarse_operator_double_test_routine( l, threading ); } + START_MASTER(threading) + if (g.test < 1e-5) + printf0("TESTS passed, highest error %e < 1e-5\n", g.test); + else + warning0("some TESTS not passed, highest error %e > 1e-5\n", g.test); + printf0("\n"); + END_MASTER(threading) } - START_LOCKED_MASTER(threading) - if (g.test < 1e-5) - printf0("TESTS passed, max error %e < 1e-5", g.test); - else - warning0("some TEST not passed, max error %e > 1e-5", g.test); - printf0("\n"); - prof_init( l ); - END_LOCKED_MASTER(threading) - if ( g.restart > 0 ) - rhs_define( g.p.b, l, threading ); +#ifdef HAVE_TM1p1 + if( g.n_flavours==1 && + (g.epsbar != 0 || g.epsbar_ig5_odd_shift != 0 || g.epsbar_ig5_odd_shift != 0) ) { + + if ( g.method >= 0 ) { + START_MASTER(threading) + g.test = 0; + printf0("Running tests with D = TM doublet operator:\n"); + END_MASTER(threading) + + data_layout_n_flavours( 2, l, threading ); + + if ( g.mixed_precision ) + two_flavours_test_float( &(l->s_float.op), l, threading ); + else + two_flavours_test_double( &(l->s_double.op), l, threading ); + + if ( g.mixed_precision ) { + operator_float_test_routine( &(l->s_float.op), l, threading ); + if ( g.method > 0 && g.method < 4 ) schwarz_float_mvm_testfun( &(l->s_float), l, threading ); + if ( g.method > 0 && g.method < 4 && g.odd_even ) block_oddeven_float_test( l, threading ); + } else { + operator_double_test_routine( &(l->s_double.op), l, threading ); + if ( g.method > 0 && g.method < 4 ) schwarz_double_mvm_testfun( &(l->s_double), l, threading ); + if ( g.method > 0 && g.method < 4 && g.odd_even ) block_oddeven_double_test( l, threading ); + } + + if ( g.interpolation && g.method > 0 ) { + if ( g.mixed_precision ) + coarse_operator_float_test_routine( l, threading ); + else + coarse_operator_double_test_routine( l, threading ); + } + + START_MASTER(threading) + if (g.test < 1e-5) + printf0("TESTS passed, highest error %e < 1e-5\n", g.test); + else + warning0("some TESTS not passed, highest error %e > 1e-5\n", g.test); + printf0("\n"); + END_MASTER(threading) + + data_layout_n_flavours( 1, l, threading ); + } + } +#endif + } diff --git a/src/sse_blas_vectorized.h b/src/sse_blas_vectorized.h deleted file mode 100644 index a23a9a5..0000000 --- a/src/sse_blas_vectorized.h +++ /dev/null @@ -1,176 +0,0 @@ -/* - * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder. - * - * This file is part of the DDalphaAMG solver library. - * - * The DDalphaAMG solver library is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * The DDalphaAMG solver library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * - * You should have received a copy of the GNU General Public License - * along with the DDalphaAMG solver library. 
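/*
 * The test0_PRECISION call used in the Schwarz test function above replaces
 * the former "print the deviation and remember the worst case" pattern, so
 * the final check in test_routine (g.test < 1e-5) covers all sub-tests. Its
 * actual definition is not shown in this patch; an assumed, purely
 * illustrative equivalent would be:
 */
#define test0_PRECISION( format, depth, diff ) do { \
    printf0( (format), (depth), (diff) );           \
    if ( (diff) > g.test ) g.test = (diff);         \
  } while (0)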
If not, see http://www.gnu.org/licenses/. - * - */ - -#ifndef SSE_BLAS_VECTORIZED_H -#define SSE_BLAS_VECTORIZED_H -#ifdef SSE - -static inline void sse_cgem_inverse( const int N, float *A_inverse, float *A, int lda ) { - // generate LU decomp in A - - int i, j, k; - complex_float alpha; - - complex_float tmpA[N*N]; - complex_float tmpA_inverse[N*N]; - - for ( j=0; j0 ) - b[k-1] = 0; - - for ( i=0; i=0; i-- ) { - for ( j=i+1; jnext_level->num_lattice_site_var/2, - offset = l->num_lattice_site_var/2; - float *spin_0_1_pt; - float *spin_2_3_pt; - float *interpolation_data; - - int component_offset = SIMD_LENGTH_float; - int fine_components = l->num_lattice_site_var; - - // U(x) = [ A B , A=A*, D=D*, C = -B* - // C D ] - // storage order: upper triangle of A, upper triangle of D, B, columnwise - // diagonal coupling - for ( int n=0; nvector_size + fine_components*SIMD_LENGTH_float*site); - - // A - buffer_re = _mm_load_ps((float *)(tmp+k1)+k+0*OPERATOR_COMPONENT_OFFSET_float); - buffer_im = _mm_load_ps((float *)(tmp+k1)+k+1*OPERATOR_COMPONENT_OFFSET_float); - for ( m=0; m broadcast - __m128 spin_0_1_re = _mm_set1_ps(spin_0_1_pt[(2*m+0)*component_offset]); - __m128 spin_0_1_im = _mm_set1_ps(spin_0_1_pt[(2*m+1)*component_offset]); - __m128 interpolation_data_re = _mm_load_ps(interpolation_data + (2*m+0)*component_offset); - __m128 interpolation_data_im = _mm_load_ps(interpolation_data + (2*m+1)*component_offset); - - cfmadd_conj(interpolation_data_re, interpolation_data_im, spin_0_1_re, spin_0_1_im, &buffer_re, &buffer_im); - } - _mm_store_ps((float *)(tmp+k1)+k+0*OPERATOR_COMPONENT_OFFSET_float, buffer_re); - _mm_store_ps((float *)(tmp+k1)+k+1*OPERATOR_COMPONENT_OFFSET_float, buffer_im); - - // D - buffer_re = _mm_load_ps((float *)(tmp+k2)+k+0*OPERATOR_COMPONENT_OFFSET_float); - buffer_im = _mm_load_ps((float *)(tmp+k2)+k+1*OPERATOR_COMPONENT_OFFSET_float); - for ( m=offset; m<2*offset; m++ ) { - // spin_2_3 is the same for all k => broadcast - __m128 spin_2_3_re = _mm_set1_ps(spin_2_3_pt[(2*m+0)*component_offset]); - __m128 spin_2_3_im = _mm_set1_ps(spin_2_3_pt[(2*m+1)*component_offset]); - __m128 interpolation_data_re = _mm_load_ps(interpolation_data + (2*m+0)*component_offset); - __m128 interpolation_data_im = _mm_load_ps(interpolation_data + (2*m+1)*component_offset); - - cfmadd_conj(interpolation_data_re, interpolation_data_im, spin_2_3_re, spin_2_3_im, &buffer_re, &buffer_im); - } - _mm_store_ps((float *)(tmp+k2)+k+0*OPERATOR_COMPONENT_OFFSET_float, buffer_re); - _mm_store_ps((float *)(tmp+k2)+k+1*OPERATOR_COMPONENT_OFFSET_float, buffer_im); - } - - // index k used for vectorization - for ( k=0; kvector_size + fine_components*component_offset*site); - - // B - buffer_re = _mm_load_ps((float *)(tmp+k1)+k+0*OPERATOR_COMPONENT_OFFSET_float); - buffer_im = _mm_load_ps((float *)(tmp+k1)+k+1*OPERATOR_COMPONENT_OFFSET_float); - for ( m=0; m broadcast - __m128 spin_2_3_re = _mm_set1_ps(spin_2_3_pt[(2*m+0)*component_offset]); - __m128 spin_2_3_im = _mm_set1_ps(spin_2_3_pt[(2*m+1)*component_offset]); - __m128 interpolation_data_re = _mm_load_ps(interpolation_data + (2*m+0)*component_offset); - __m128 interpolation_data_im = _mm_load_ps(interpolation_data + (2*m+1)*component_offset); - - cfmadd_conj(interpolation_data_re, interpolation_data_im, spin_2_3_re, spin_2_3_im, &buffer_re, &buffer_im); - } - _mm_store_ps((float *)(tmp+k1)+k+0*OPERATOR_COMPONENT_OFFSET_float, buffer_re); - _mm_store_ps((float *)(tmp+k1)+k+1*OPERATOR_COMPONENT_OFFSET_float, buffer_im); - } - } -#endif -} - - -static inline 
void sse_set_coarse_neighbor_coupling_float( complex_float *spin_0_1, complex_float *spin_2_3, - complex_float *V, const int mu, level_struct *l, int site, const int n_rhs, complex_float *tmp ) { - -#ifdef SSE - int k, k1, k2, m, num_eig_vect = l->next_level->num_lattice_site_var/2, - offset = l->num_lattice_site_var/2; - - float *spin_0_1_pt; - float *spin_2_3_pt; - float *interpolation_data; - - int component_offset = SIMD_LENGTH_float; - int fine_components = l->num_lattice_site_var; - - // U_mu(x) = [ A B , U_-mu(x+muhat) = [ A* -C* - // C D ] -B* D* ] - // storage order: A, C, B, D, each column wise - for ( int n=0; nvector_size + fine_components*component_offset*site); - - k1 = (n+0*num_eig_vect)*OPERATOR_COMPONENT_OFFSET_float; - k2 = (n+1*num_eig_vect)*OPERATOR_COMPONENT_OFFSET_float; - - // A - buffer_re = _mm_load_ps((float *)(tmp+k1)+k+0*OPERATOR_COMPONENT_OFFSET_float); - buffer_im = _mm_load_ps((float *)(tmp+k1)+k+1*OPERATOR_COMPONENT_OFFSET_float); - for ( m=0; m broadcast - __m128 spin_0_1_re = _mm_set1_ps(spin_0_1_pt[(2*m+0)*component_offset]); - __m128 spin_0_1_im = _mm_set1_ps(spin_0_1_pt[(2*m+1)*component_offset]); - __m128 interpolation_data_re = _mm_load_ps(interpolation_data + (2*m+0)*component_offset); - __m128 interpolation_data_im = _mm_load_ps(interpolation_data + (2*m+1)*component_offset); - - cfmadd_conj(interpolation_data_re, interpolation_data_im, spin_0_1_re, spin_0_1_im, &buffer_re, &buffer_im); - } - _mm_store_ps((float *)(tmp+k1)+k+0*OPERATOR_COMPONENT_OFFSET_float, buffer_re); - _mm_store_ps((float *)(tmp+k1)+k+1*OPERATOR_COMPONENT_OFFSET_float, buffer_im); - - // C - buffer_re = _mm_load_ps((float *)(tmp+k2)+k+0*OPERATOR_COMPONENT_OFFSET_float); - buffer_im = _mm_load_ps((float *)(tmp+k2)+k+1*OPERATOR_COMPONENT_OFFSET_float); - for ( m=offset; m<2*offset; m++ ) { - // spin_0_1 is the same for all k => broadcast - __m128 spin_0_1_re = _mm_set1_ps(spin_0_1_pt[(2*m+0)*component_offset]); - __m128 spin_0_1_im = _mm_set1_ps(spin_0_1_pt[(2*m+1)*component_offset]); - __m128 interpolation_data_re = _mm_load_ps(interpolation_data + (2*m+0)*component_offset); - __m128 interpolation_data_im = _mm_load_ps(interpolation_data + (2*m+1)*component_offset); - - cfmadd_conj(interpolation_data_re, interpolation_data_im, spin_0_1_re, spin_0_1_im, &buffer_re, &buffer_im); - } - _mm_store_ps((float *)(tmp+k2)+k+0*OPERATOR_COMPONENT_OFFSET_float, buffer_re); - _mm_store_ps((float *)(tmp+k2)+k+1*OPERATOR_COMPONENT_OFFSET_float, buffer_im); - - - k1 = (n+2*num_eig_vect)*OPERATOR_COMPONENT_OFFSET_float; - k2 = (n+3*num_eig_vect)*OPERATOR_COMPONENT_OFFSET_float; - - // B - buffer_re = _mm_load_ps((float *)(tmp+k1)+k+0*OPERATOR_COMPONENT_OFFSET_float); - buffer_im = _mm_load_ps((float *)(tmp+k1)+k+1*OPERATOR_COMPONENT_OFFSET_float); - for ( m=0; m broadcast - __m128 spin_2_3_re = _mm_set1_ps(spin_2_3_pt[(2*m+0)*component_offset]); - __m128 spin_2_3_im = _mm_set1_ps(spin_2_3_pt[(2*m+1)*component_offset]); - __m128 interpolation_data_re = _mm_load_ps(interpolation_data + (2*m+0)*component_offset); - __m128 interpolation_data_im = _mm_load_ps(interpolation_data + (2*m+1)*component_offset); - - cfmadd_conj(interpolation_data_re, interpolation_data_im, spin_2_3_re, spin_2_3_im, &buffer_re, &buffer_im); - } - _mm_store_ps((float *)(tmp+k1)+k+0*OPERATOR_COMPONENT_OFFSET_float, buffer_re); - _mm_store_ps((float *)(tmp+k1)+k+1*OPERATOR_COMPONENT_OFFSET_float, buffer_im); - - // D - buffer_re = _mm_load_ps((float *)(tmp+k2)+k+0*OPERATOR_COMPONENT_OFFSET_float); - buffer_im = 
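/*
 * In essence, the (removed) set_coarse_*_coupling routines here assemble the
 * aggregate-wise Galerkin product: a coarse coupling entry is
 *   A_{mn} = sum_{x in aggregate} conj( v_m(x) ) * (D v_n)(x),
 * split into spin-0/1 and spin-2/3 halves. Scalar illustration of one such
 * accumulation (hypothetical helper; eta_n is assumed to already hold D
 * applied to test vector n on this aggregate):
 */
#include <complex.h>

static void accumulate_coupling_entry( float complex *A_mn, const float complex *v_m,
                                       const float complex *eta_n, int len ) {
  for ( int x = 0; x < len; x++ )
    *A_mn += conjf( v_m[x] ) * eta_n[x];   // conj(test vector) times D*(test vector)
}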
_mm_load_ps((float *)(tmp+k2)+k+1*OPERATOR_COMPONENT_OFFSET_float); - for ( m=offset; m<2*offset; m++ ) { - // spin_2_3 is the same for all k => broadcast - __m128 spin_2_3_re = _mm_set1_ps(spin_2_3_pt[(2*m+0)*component_offset]); - __m128 spin_2_3_im = _mm_set1_ps(spin_2_3_pt[(2*m+1)*component_offset]); - __m128 interpolation_data_re = _mm_load_ps(interpolation_data + (2*m+0)*component_offset); - __m128 interpolation_data_im = _mm_load_ps(interpolation_data + (2*m+1)*component_offset); - - cfmadd_conj(interpolation_data_re, interpolation_data_im, spin_2_3_re, spin_2_3_im, &buffer_re, &buffer_im); - } - _mm_store_ps((float *)(tmp+k2)+k+0*OPERATOR_COMPONENT_OFFSET_float, buffer_re); - _mm_store_ps((float *)(tmp+k2)+k+1*OPERATOR_COMPONENT_OFFSET_float, buffer_im); - } - } -#endif -} - - -static inline void sse_coarse_spinwise_site_self_couplings_float( complex_float *eta1, complex_float *eta2, - complex_float *phi, config_float clover, int elements, level_struct *l ) { - -#ifdef SSE - int num_eig_vect = l->num_lattice_site_var/2; - int clover_step_size1 = (num_eig_vect * (num_eig_vect+1))/2; - complex_float *eta[2] = {eta1, eta2}; - // U(x) = [ A B , A=A*, D=D*, C = -B* - // C D ] - // storage order: upper triangle of A, upper triangle of D, B, columnwise - - __m128 clover_re; - __m128 clover_im; - __m128 in_re; - __m128 in_im; - __m128 out_re; - __m128 out_im; - - // zero output matrices - __m128 zero = _mm_setzero_ps(); - for(int s=0; s<2; s++) { - for(int i=0; ieta1) or 2and3 (->eta2) - eta[1] += num_eig_vect*elements; - for(int s=0; s<2; s++) { - // A and D: column major hermitian, stored as upper triangular - for(int i=0; inum_eig_vect, j, num_aggregates = l->is_PRECISION.num_agg, - aggregate_sites = l->num_inner_lattice_sites / num_aggregates, - clover_site_size = (l->next_level->num_lattice_site_var*(l->next_level->num_lattice_site_var+1))/2, - D_link_size = 4*l->num_eig_vect*l->num_eig_vect*4, // size of links in all 4 directions - fine_components = l->num_lattice_site_var; - - - - START_LOCKED_MASTER(threading) - operator_PRECISION_define( &(l->next_level->op_PRECISION), l->next_level ); - END_LOCKED_MASTER(threading) - SYNC_HYPERTHREADS(threading) - - // each thread loops overs its aggregates and then over internal d.o.f. 
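/*
 * The aggregate loops in this (removed) routine distribute aggregates
 * round-robin over all hyperthreads: each thread starts at its global id and
 * strides by the total number of threads. Minimal sketch of the same pattern
 * (assuming the usual meaning of the threading fields used above; the helper
 * name is hypothetical):
 */
static inline void aggregate_range_example( struct Thread *threading, int num_aggregates ) {
  int tid    = threading->n_core * threading->thread + threading->core;  // global hyperthread id
  int stride = threading->n_core * threading->n_thread;                  // total number of hyperthreads
  for ( int a = tid; a < num_aggregates; a += stride ) {
    /* process aggregate a: zero the tmp buffer, loop over its sites, finalize */
  }
}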
- for ( int a=threading->n_core*threading->thread+threading->core; an_core*threading->n_thread ) { - for ( j=0; jnext_level->op_PRECISION.D[j+a*D_link_size] = _COMPLEX_PRECISION_ZERO; - for ( j=0; jnext_level->op_PRECISION.clover[j+a*clover_site_size] = _COMPLEX_PRECISION_ZERO; - } - - complex_PRECISION *mpi_buffer = NULL; - START_MASTER(threading) - MALLOC_HUGEPAGES( mpi_buffer, complex_PRECISION, OPERATOR_COMPONENT_OFFSET_PRECISION*(l->vector_size-l->inner_vector_size), 64 ); - END_MASTER(threading) - - int direction_flags[8*l->block_lattice[T]*l->block_lattice[Z]*l->block_lattice[Y]*l->block_lattice[X]]; - - // set up table for direction flags - int *flags = direction_flags; - if(l->depth == 0) { - // even sites - for(int t=0; t < l->block_lattice[T]; t++) { - for(int z=0; z < l->block_lattice[Z]; z++) { - for(int y=0; y < l->block_lattice[Y]; y++) { - for(int x=0; x < l->block_lattice[X]/2; x++) { - flags[2*X+0] = 1; - flags[2*X+1] = 1; - if((y+z+t)%2 == 0) { - if(x == 0) - flags[2*X+0] = 0; - } else { - if(x == l->block_lattice[X]/2-1) - flags[2*X+1] = 0; - } - flags[2*Y+0] = (y == 0)?0:1; - flags[2*Y+1] = (y == l->block_lattice[Y]-1)?0:1; - flags[2*Z+0] = (z == 0)?0:1; - flags[2*Z+1] = (z == l->block_lattice[Z]-1)?0:1; - flags[2*T+0] = (t == 0)?0:1; - flags[2*T+1] = (t == l->block_lattice[T]-1)?0:1; - flags += 8; - } - } - } - } - // odd sites - for(int t=0; t < l->block_lattice[T]; t++) { - for(int z=0; z < l->block_lattice[Z]; z++) { - for(int y=0; y < l->block_lattice[Y]; y++) { - for(int x=0; x < l->block_lattice[X]/2; x++) { - flags[2*X+0] = 1; - flags[2*X+1] = 1; - if((y+z+t)%2 == 1) { - if(x == 0) - flags[2*X+0] = 0; - } else { - if(x == l->block_lattice[X]/2-1) - flags[2*X+1] = 0; - } - flags[2*Y+0] = (y == 0)?0:1; - flags[2*Y+1] = (y == l->block_lattice[Y]-1)?0:1; - flags[2*Z+0] = (z == 0)?0:1; - flags[2*Z+1] = (z == l->block_lattice[Z]-1)?0:1; - flags[2*T+0] = (t == 0)?0:1; - flags[2*T+1] = (t == l->block_lattice[T]-1)?0:1; - flags += 8; - } - } - } - } - } else { - for(int t=0; t < l->block_lattice[T]; t++) { - for(int z=0; z < l->block_lattice[Z]; z++) { - for(int y=0; y < l->block_lattice[Y]; y++) { - for(int x=0; x < l->block_lattice[X]; x++) { - flags[2*X+0] = (x == 0)?0:1; - flags[2*X+1] = (x == l->block_lattice[X]-1)?0:1; - flags[2*Y+0] = (y == 0)?0:1; - flags[2*Y+1] = (y == l->block_lattice[Y]-1)?0:1; - flags[2*Z+0] = (z == 0)?0:1; - flags[2*Z+1] = (z == l->block_lattice[Z]-1)?0:1; - flags[2*T+0] = (t == 0)?0:1; - flags[2*T+1] = (t == l->block_lattice[T]-1)?0:1; - flags += 8; - } - } - } - } - } - - complex_PRECISION eta1[fine_components*OPERATOR_COMPONENT_OFFSET_PRECISION] __attribute__((aligned(64))); - complex_PRECISION eta2[fine_components*OPERATOR_COMPONENT_OFFSET_PRECISION] __attribute__((aligned(64))); - complex_PRECISION tmp[4*4*n*OPERATOR_COMPONENT_OFFSET_PRECISION] __attribute__((aligned(64))); - - for ( int a=threading->n_core*threading->thread+threading->core; an_core*threading->n_thread ) { - - // new aggregate is starting, zero out tmp - for(int i=0; i<4*4*n*OPERATOR_COMPONENT_OFFSET_PRECISION; i++) - tmp[i] = 0.0; - - for ( int site=a*aggregate_sites; site<(a+1)*aggregate_sites; site++ ) { - if(l->depth == 0) { - for ( int c=0; cnum_eig_vect; c+=SIMD_LENGTH_PRECISION ) - d_plus_clover_aggregate_PRECISION_vectorized( eta1+c*fine_components, eta2+c*fine_components, - operator+c*l->vector_size, &(l->s_PRECISION), l, site, - direction_flags+8*(site%(l->block_lattice[T]*l->block_lattice[Z]*l->block_lattice[Y]*l->block_lattice[X])) ); - } else { - for ( int 
c=0; cnum_eig_vect; c+=SIMD_LENGTH_PRECISION ) - coarse_aggregate_self_couplings_PRECISION_vectorized( eta1+c*fine_components, eta2+c*fine_components, - operator+c*l->vector_size, &(l->s_PRECISION), l, site, - direction_flags+8*(site%(l->block_lattice[T]*l->block_lattice[Z]*l->block_lattice[Y]*l->block_lattice[X])) ); - } - set_coarse_self_coupling_PRECISION_vectorized( eta1, eta2, operator, l, site, n, tmp ); - } - - // aggregate is done, finalize - set_coarse_self_coupling_PRECISION_vectorized_finalize( l, a*aggregate_sites, n, tmp ); - - } - - - SYNC_HYPERTHREADS(threading) - START_LOCKED_MASTER(threading) - // neighbors - for ( int c=0; cnum_eig_vect; c+=SIMD_LENGTH_PRECISION ) { - for ( mu=0; mu<4; mu++ ) { - // determine start of buffer for this mu - int start = 0; - for ( int j=0; js_PRECISION.op.c.num_boundary_sites[2*j]; - - // update ghost cells of V[i] - negative_sendrecv_PRECISION_vectorized( operator+c*l->vector_size, mu, &(l->s_PRECISION.op.c), l, - SIMD_LENGTH_PRECISION, mpi_buffer+c*(l->vector_size-l->inner_vector_size)+fine_components*start*SIMD_LENGTH_PRECISION ); - } - for ( mu=0; mu<4; mu++ ) { - // finish updating ghostcells of V[i] - negative_wait_PRECISION( mu, &(l->s_PRECISION.op.c), l ); - } - } - END_LOCKED_MASTER(threading) - SYNC_HYPERTHREADS(threading) - - - for ( int a=threading->n_core*threading->thread+threading->core; an_core*threading->n_thread ) { - - // new aggregate is starting, zero out tmp - for(int i=0; i<4*4*n*OPERATOR_COMPONENT_OFFSET_PRECISION; i++) - tmp[i] = 0.0; - - for ( int site=a*aggregate_sites; site<(a+1)*aggregate_sites; site++ ) { - for ( mu=0; mu<4; mu++ ) { - if( (direction_flags+8*(site%(l->block_lattice[T]*l->block_lattice[Z]*l->block_lattice[Y]*l->block_lattice[X])))[2*mu+1] != 0) - continue; - - if(l->depth == 0) - for ( int c=0; cnum_eig_vect; c+=SIMD_LENGTH_PRECISION ) - d_neighbor_aggregate_PRECISION_vectorized( eta1+c*fine_components, eta2+c*fine_components, - operator+c*l->vector_size, mu, &(l->s_PRECISION), l, site ); - else - for ( int c=0; cnum_eig_vect; c+=SIMD_LENGTH_PRECISION ) - coarse_aggregate_neighbor_couplings_PRECISION_vectorized( eta1+c*fine_components, eta2+c*fine_components, - operator+c*l->vector_size, mu, &(l->s_PRECISION), l, site ); - set_coarse_neighbor_coupling_PRECISION_vectorized( eta1, eta2, operator, mu, l, site, n, tmp+mu*4*n*OPERATOR_COMPONENT_OFFSET_PRECISION ); - } - } - - // aggregate is done, finalize - for ( mu=0; mu<4; mu++ ) - set_coarse_neighbor_coupling_PRECISION_vectorized_finalize( mu, l, a*aggregate_sites, n, tmp+mu*4*n*OPERATOR_COMPONENT_OFFSET_PRECISION ); - - } - START_MASTER(threading) - FREE_HUGEPAGES( mpi_buffer, complex_PRECISION, OPERATOR_COMPONENT_OFFSET_PRECISION*(l->vector_size-l->inner_vector_size) ); - - t1 = MPI_Wtime(); - if ( g.print > 0 ) printf0("depth: %d, time spent for setting up next coarser operator: %lf seconds\n", l->depth, t1-t0 ); - END_MASTER(threading) - - SYNC_HYPERTHREADS(threading) - SYNC_CORES(threading) -} - -#ifdef VECTORIZE_COARSE_OPERATOR_PRECISION -void coarse_operator_PRECISION_set_couplings( operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { - - int n = l->num_inner_lattice_sites; - int sc_size = (l->num_lattice_site_var/2)*(l->num_lattice_site_var+1); - int nc_size = SQUARE(l->num_lattice_site_var); - int n1, n2; - if ( l->depth > 0 ) { - n1 = l->num_lattice_sites; - n2 = 2*l->num_lattice_sites-l->num_inner_lattice_sites; - } else { - n1 = l->num_inner_lattice_sites; - n2 = l->num_inner_lattice_sites; - } - - 
START_LOCKED_MASTER(threading) - if( op->D_vectorized == NULL ) { - int column_offset = SIMD_LENGTH_PRECISION*((l->num_lattice_site_var+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - // 2 is for complex, 4 is for 4 directions - MALLOC_HUGEPAGES( op->D_vectorized, OPERATOR_TYPE_PRECISION, 2*4*l->num_lattice_site_var*column_offset*n2, 64 ); - MALLOC_HUGEPAGES( op->D_transformed_vectorized, OPERATOR_TYPE_PRECISION, 2*4*l->num_lattice_site_var*column_offset*n2, 64 ); - MALLOC_HUGEPAGES( op->clover_vectorized, OPERATOR_TYPE_PRECISION, 2*l->num_lattice_site_var*column_offset*n, 64 ); - } - END_LOCKED_MASTER(threading) - - int start, end; - compute_core_start_end_custom(0, n, &start, &end, l, threading, 1); - int n_per_core = end-start; - int column_offset = SIMD_LENGTH_PRECISION*((l->num_lattice_site_var+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - int offset_v = 2*l->num_lattice_site_var*column_offset; - copy_coarse_operator_to_vectorized_layout_PRECISION( - op->D + 4*start*nc_size, - op->D_vectorized + 4*start*offset_v, - n_per_core, l->num_lattice_site_var/2); - copy_coarse_operator_to_transformed_vectorized_layout_PRECISION( - op->D + 4*start*nc_size, - op->D_transformed_vectorized + 4*start*offset_v, - n_per_core, l->num_lattice_site_var/2); - copy_coarse_operator_clover_to_vectorized_layout_PRECISION( - op->clover + start*sc_size, - op->clover_vectorized + start*offset_v, - n_per_core, l->num_lattice_site_var/2); -#ifdef HAVE_TM - int tm_size = (l->num_lattice_site_var/2)*(l->num_lattice_site_var/2+1); - add_tm_term_to_vectorized_layout_PRECISION( - op->tm_term + start*tm_size, - op->clover_vectorized + start*offset_v, - n_per_core, l->num_lattice_site_var/2); -#endif - SYNC_CORES(threading) - - // vectorize negative boundary - if ( l->depth > 0 ) { - compute_core_start_end_custom(n1, n2, &start, &end, l, threading, 1); - n_per_core = end-start; - copy_coarse_operator_to_vectorized_layout_PRECISION( - op->D + 4*start*nc_size, - op->D_vectorized + 4*start*offset_v, - n_per_core, l->num_lattice_site_var/2); - copy_coarse_operator_to_transformed_vectorized_layout_PRECISION( - op->D + 4*start*nc_size, - op->D_transformed_vectorized + 4*start*offset_v, - n_per_core, l->num_lattice_site_var/2); - SYNC_CORES(threading) - } -} - -void coarse_operator_PRECISION_set_couplings_clover( operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { - - if(op->D_vectorized == 0) - coarse_operator_PRECISION_set_couplings(op, l, threading); - - int n = l->num_inner_lattice_sites; - int sc_size = (l->num_lattice_site_var/2)*(l->num_lattice_site_var+1); - int start, end; - - compute_core_start_end_custom(0, n, &start, &end, l, threading, 1); - int n_per_core = end-start; - int column_offset = SIMD_LENGTH_PRECISION*((l->num_lattice_site_var+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - int offset_v = 2*l->num_lattice_site_var*column_offset; - - copy_coarse_operator_clover_to_vectorized_layout_PRECISION( - op->clover + start*sc_size, - op->clover_vectorized + start*offset_v, - n_per_core, l->num_lattice_site_var/2); -#ifdef HAVE_TM - int tm_size = (l->num_lattice_site_var/2)*(l->num_lattice_site_var/2+1); - add_tm_term_to_vectorized_layout_PRECISION( - op->tm_term + start*tm_size, - op->clover_vectorized + start*offset_v, - n_per_core, l->num_lattice_site_var/2); -#endif - -} -#endif - - -void set_coarse_self_coupling_PRECISION_vectorized( complex_PRECISION *spin_0_1, complex_PRECISION *spin_2_3, - complex_PRECISION *V, level_struct *l, int site, const int n_rhs, complex_PRECISION 
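/*
 * The recurring expression
 *   column_offset = SIMD_LENGTH_PRECISION*((n+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION)
 * (and likewise OPERATOR_COMPONENT_OFFSET_*) simply rounds n up to the next
 * multiple of the SIMD width, so every column of the vectorized operator
 * starts on a full SIMD lane. Illustration (hypothetical helper): with a
 * SIMD length of 4, n = 6 -> 8, n = 8 -> 8, n = 9 -> 12.
 */
static inline int round_up_to_simd_length( int n, int simd_length ) {
  return simd_length * ( (n + simd_length - 1) / simd_length );  // ceil(n/simd_length)*simd_length
}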
*tmp ) { - - sse_set_coarse_self_coupling_PRECISION( spin_0_1, spin_2_3, V, l, site, n_rhs, tmp ); -} - - -void set_coarse_self_coupling_PRECISION_vectorized_finalize( level_struct *l, int site, const int n_rhs, complex_PRECISION *tmp ) { - - int k, k1, k2, num_aggregates = l->is_PRECISION.num_agg, - num_eig_vect = l->next_level->num_lattice_site_var/2, - aggregate_size = l->inner_vector_size / num_aggregates, - clover_site_size = (l->next_level->num_lattice_site_var*(l->next_level->num_lattice_site_var+1))/2; - int t1, t2; - - config_PRECISION clover_pt, clover = l->next_level->op_PRECISION.clover; - - // just an abbreviation - int component_offset = OPERATOR_COMPONENT_OFFSET_PRECISION; - int fine_components = l->num_lattice_site_var; - - int aggregate = (fine_components*site)/aggregate_size; - clover_pt = clover + aggregate*clover_site_size; - - // U(x) = [ A B , A=A*, D=D*, C = -B* - // C D ] - // storage order: upper triangle of A, upper triangle of D, B, columnwise - // diagonal coupling - for ( int n=0; nnext_level->num_lattice_site_var/2, - D_link_size = num_eig_vect*num_eig_vect*4; - int t1, t2; - - config_PRECISION D_pt, D = l->next_level->op_PRECISION.D; - - // just an abbreviation - int component_offset = OPERATOR_COMPONENT_OFFSET_PRECISION; - int fine_components = l->num_lattice_site_var; - - int aggregate = (fine_components*site)/(l->inner_vector_size / l->is_PRECISION.num_agg); - D_pt = D + (4*aggregate+mu)*D_link_size; - - // U_mu(x) = [ A B , U_-mu(x+muhat) = [ A* -C* - // C D ] -B* D* ] - // storage order: A, C, B, D, each column wise - for ( int n=0; n i) { - ip = j; - jp = i; - sign = -1.0; - } - int offset_to_column = (ip*ip+ip)/2; // upper triangle including diagonal - out_tmp[(2*i+0)*column_offset + j] = creal(clover[offset_to_column+jp]); - out_tmp[(2*i+1)*column_offset + j] = sign*cimag(clover[offset_to_column+jp]); - // C = -B^dagger - out_tmp[(2*i+0)*column_offset + j + vecs] = -creal(clover[offset_to_B + j*vecs+i]); - out_tmp[(2*i+1)*column_offset + j + vecs] = cimag(clover[offset_to_B + j*vecs+i]); - } - // zero - for(int j=2*vecs; j i) { - ip = j; - jp = i; - sign = -1.0; - } - int offset_to_column = (ip*ip+ip)/2; // upper triangle including diagonal - out_tmp[(2*(i+vecs)+0)*column_offset + j + vecs] = creal(clover[offset_to_D + offset_to_column+jp]); - out_tmp[(2*(i+vecs)+1)*column_offset + j + vecs] = sign*cimag(clover[offset_to_D + offset_to_column+jp]); - } - // zero - for(int j=2*vecs; j i) { - ip = j; - jp = i; - sign = -1.0; - } - int offset_to_column = (ip*ip+ip)/2; // upper triangle including diagonal - out_tmp[(2*i+0)*column_offset + j] += sign*creal(tm_term[offset_to_column+jp]); - out_tmp[(2*i+1)*column_offset + j] += cimag(tm_term[offset_to_column+jp]); - out_tmp[(2*(i+vecs)+0)*column_offset + j + vecs] += sign*creal(tm_term[offset_to_D + offset_to_column+jp]); - out_tmp[(2*(i+vecs)+1)*column_offset + j + vecs] += cimag(tm_term[offset_to_D + offset_to_column+jp]); - } - } - - tm_term += 2*offset_to_D; -#ifndef STORE_COARSE_OPERATOR_AS_FLOAT16 - // out_tmp is an alias for the actual output - out_tmp += 2*column_offset*2*vecs; -#else - //TODO - error0("STORE_COARSE_OPERATOR_AS_FLOAT16 not implemented for HAVE_TM") - convert_PRECISION_to_half(2*column_offset*2*vecs, out_tmp, clover_vectorized); - clover_vectorized += 2*column_offset*2*vecs; -#endif - } -#endif -} - -void coarse_aggregate_self_couplings_PRECISION_vectorized( complex_PRECISION *eta1, complex_PRECISION *eta2, - complex_PRECISION *phi, schwarz_PRECISION_struct *s, - level_struct *l, int 
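/*
 * The self-coupling blocks A and D are hermitian, so only their upper
 * triangles are stored, column-wise: column i starts at offset i*(i+1)/2 and
 * element (row j, column i) with j <= i sits at i*(i+1)/2 + j, which is the
 * (ip*ip+ip)/2 indexing used above; entries below the diagonal follow by
 * hermiticity. Scalar illustration (hypothetical helper):
 */
#include <complex.h>

static inline float complex packed_hermitian_entry( const float complex *U, int j, int i ) {
  if ( j <= i )
    return U[ (i*(i+1))/2 + j ];          // stored explicitly in the upper triangle
  return conjf( U[ (j*(j+1))/2 + i ] );   // reconstructed by hermiticity
}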
site, int *direction_flags ) { - - int offset = SIMD_LENGTH_PRECISION; - int site_offset = l->num_lattice_site_var*offset; - int index_bw; - int index_fw; - int *neighbor = s->op.neighbor_table; - int *backward_neighbor = s->op.backward_neighbor_table; - complex_PRECISION *phi_pt; - config_PRECISION D_pt; - config_PRECISION D = s->op.D; - int n = l->num_lattice_site_var; - int D_site_offset = 4*n*n; - int D_link_offset = n*n; - int clover_offset = (n*(n+1))/2*site; - - coarse_spinwise_site_self_couplings_PRECISION_vectorized( eta1, eta2, phi+site_offset*site, s->op.clover+clover_offset, offset, l ); - - for(int mu=0; mu<4; mu++) { - index_fw = neighbor[5*site+1 + mu]; - index_bw = backward_neighbor[5*site+1 + mu]; - - // from backward - if ( direction_flags[2*mu+0] == 1 ) { - D_pt = D + D_site_offset*index_bw + D_link_offset*mu; - phi_pt = phi + site_offset*index_bw; - coarse_spinwise_n_daggered_hopp_PRECISION_vectorized( eta1, eta2, phi_pt, D_pt, offset, l ); - } - - // from forward - if ( direction_flags[2*mu+1] == 1 ) { - D_pt = D + D_site_offset*site + D_link_offset*mu; - phi_pt = phi + site_offset*index_fw; - coarse_spinwise_n_hopp_PRECISION_vectorized( eta1, eta2, phi_pt, D_pt, offset, l ); - } - } -} - - -void coarse_aggregate_neighbor_couplings_PRECISION_vectorized( complex_PRECISION *eta1, complex_PRECISION *eta2, - complex_PRECISION *phi, const int mu, - schwarz_PRECISION_struct *s, level_struct *l, int site ) { - - int offset = SIMD_LENGTH_PRECISION; - int site_offset = l->num_lattice_site_var*offset; - int index_fw; - int *neighbor = s->op.neighbor_table; - complex_PRECISION *phi_pt; - config_PRECISION D_pt; - config_PRECISION D = s->op.D; - int n = l->num_lattice_site_var; - int D_site_offset = 4*n*n; - int D_link_offset = n*n; - - vector_PRECISION_define( eta1, 0, 0, n*offset, l ); - vector_PRECISION_define( eta2, 0, 0, n*offset, l ); - - // requires the positive boundaries of phi to be communicated before - index_fw = neighbor[5*site+1 + mu]; - D_pt = D + D_site_offset*site + D_link_offset*mu; - phi_pt = phi + site_offset*index_fw; - coarse_spinwise_hopp_PRECISION_vectorized( eta1, eta2, phi_pt, D_pt, offset, l ); -} - - -void coarse_spinwise_site_self_couplings_PRECISION_vectorized( - complex_PRECISION *eta1, complex_PRECISION *eta2, - complex_PRECISION *phi, config_PRECISION clover, int elements, level_struct *l ) { - - sse_coarse_spinwise_site_self_couplings_PRECISION( eta1, eta2, phi, clover, elements, l ); -} - -#ifdef VECTORIZE_COARSE_OPERATOR_PRECISION -void coarse_block_operator_PRECISION( vector_PRECISION eta, vector_PRECISION phi, int start, schwarz_PRECISION_struct *s, - level_struct *l, struct Thread *threading ) { - - START_UNTHREADED_FUNCTION(threading) - - int n = s->num_block_sites, *length = s->dir_length, **index = s->index, - *ind, *neighbor = s->op.neighbor_table, m = l->num_lattice_site_var; - vector_PRECISION lphi = phi+start, leta = eta+start; - int column_offset = SIMD_LENGTH_PRECISION*((l->num_lattice_site_var+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - int vectorized_link_offset = 2*l->num_lattice_site_var*column_offset; - - // site-wise self coupling - coarse_self_couplings_PRECISION_vectorized( eta, phi, s->op.clover_vectorized, (start/m), (start/m)+n, l ); - - // inner block couplings - for ( int mu=0; mu<4; mu++ ) { - OPERATOR_TYPE_PRECISION *Dplus = s->op.D_vectorized + - (start/m)*4*vectorized_link_offset + mu*vectorized_link_offset; - OPERATOR_TYPE_PRECISION *Dminus = s->op.D_transformed_vectorized + - 
(start/m)*4*vectorized_link_offset + mu*vectorized_link_offset; - ind = index[mu]; // mu direction - for ( int i=0; inum_inner_lattice_sites, &start, &end, l, threading, 1); - coarse_self_couplings_PRECISION_vectorized( eta, phi, op->clover_vectorized, start, end, l ); - SYNC_CORES(threading) - PROF_PRECISION_STOP( _SC, 1, threading ); - PROF_PRECISION_START( _NC, threading ); - coarse_hopping_term_PRECISION( eta, phi, op, _FULL_SYSTEM, l, threading ); - PROF_PRECISION_STOP( _NC, 1, threading ); -} -#endif - -#endif // SSE diff --git a/src/sse_coarse_operator_generic.h b/src/sse_coarse_operator_generic.h deleted file mode 100644 index b805944..0000000 --- a/src/sse_coarse_operator_generic.h +++ /dev/null @@ -1,294 +0,0 @@ -/* - * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder. - * - * This file is part of the DDalphaAMG solver library. - * - * The DDalphaAMG solver library is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * The DDalphaAMG solver library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * - * You should have received a copy of the GNU General Public License - * along with the DDalphaAMG solver library. If not, see http://www.gnu.org/licenses/. - * - */ - -#ifndef SSE_COARSE_OPERATOR_PRECISION_HEADER - #define SSE_COARSE_OPERATOR_PRECISION_HEADER - - #ifdef SSE - - #include "blas_vectorized.h" - - void coarse_operator_PRECISION_setup_vectorized( complex_PRECISION *operator, level_struct *l, struct Thread *threading ); - void set_coarse_self_coupling_PRECISION_vectorized( complex_PRECISION *spin_0_1, complex_PRECISION *spin_2_3, - complex_PRECISION *V, level_struct *l, int site, const int n_rhs, complex_PRECISION *tmp ); - void set_coarse_self_coupling_PRECISION_vectorized_finalize( level_struct *l, int site, const int n_rhs, complex_PRECISION *tmp ); - // here we do not check whether site is really on boundary, caller is responsible for that - // tmp is used to store coarse operator with padding, until sum over all sites has been done - void set_coarse_neighbor_coupling_PRECISION_vectorized( complex_PRECISION *buffer1, complex_PRECISION *buffer2, - complex_PRECISION *V, const int mu, level_struct *l, int site, const int n_rhs, complex_PRECISION *tmp ); - void set_coarse_neighbor_coupling_PRECISION_vectorized_finalize( const int mu, level_struct *l, int site, const int n_rhs, complex_PRECISION *tmp ); - - void copy_coarse_operator_to_vectorized_layout_PRECISION(config_PRECISION D, - OPERATOR_TYPE_PRECISION *D_vectorized, int num_aggregates, int num_eig_vect); - // fw and bw links have a symmetry that allows constructing one from another, see, e.g., coarse_hopp_PRECISION - // for vectorization we store the operator for both cases, the "daggered" links need this transformed layout - void copy_coarse_operator_to_transformed_vectorized_layout_PRECISION(config_PRECISION D, - OPERATOR_TYPE_PRECISION *D_vectorized, int num_aggregates, int num_eig_vect); - void copy_coarse_operator_clover_to_vectorized_layout_PRECISION(config_PRECISION clover, - OPERATOR_TYPE_PRECISION *clover_vectorized, int num_aggregates, int num_eig_vect); - void 
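
[Editor's sketch, an assumption drawn from the deleted header's comment that forward and backward coarse links are related, so the "daggered" links can be built rather than stored independently. It follows the block comment U_mu(x) = [ A B ; C D ], U_{-mu}(x+muhat) = [ A* -C* ; -B* D* ]; dense row-major blocks are used purely for illustration.]

#include <complex.h>

/* out = sign * in^dagger for a dense v-by-v block (row-major) */
static void block_dagger( const float complex *in, float complex *out, int v, float sign ) {
  for ( int r=0; r<v; r++ )
    for ( int c=0; c<v; c++ )
      out[r*v+c] = sign*conjf( in[c*v+r] );
}

/* given the blocks A, C, B, D of U_mu(x), build the blocks of U_{-mu}(x+muhat) */
void coarse_link_dagger( const float complex *A, const float complex *C,
                         const float complex *B, const float complex *D,
                         float complex *Am, float complex *Cm,
                         float complex *Bm, float complex *Dm, int v ) {
  block_dagger( A, Am, v, +1.0f );   /* A-block of U_{-mu}:  A^dagger */
  block_dagger( C, Bm, v, -1.0f );   /* B-block of U_{-mu}: -C^dagger */
  block_dagger( B, Cm, v, -1.0f );   /* C-block of U_{-mu}: -B^dagger */
  block_dagger( D, Dm, v, +1.0f );   /* D-block of U_{-mu}:  D^dagger */
}
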
add_tm_term_to_vectorized_layout_PRECISION(config_PRECISION tm_term, - OPERATOR_TYPE_PRECISION *clover_vectorized, int num_aggregates, int num_eig_vect); - - void coarse_spinwise_site_self_couplings_PRECISION_vectorized( - complex_PRECISION *eta1, complex_PRECISION *eta2, - complex_PRECISION *phi, config_PRECISION clover, int elements, level_struct *l ); - - void coarse_aggregate_self_couplings_PRECISION_vectorized( complex_PRECISION *eta1, complex_PRECISION *eta2, - complex_PRECISION *phi, schwarz_PRECISION_struct *s, level_struct *l, - int site, int *direction_flags ); - - void coarse_aggregate_neighbor_couplings_PRECISION_vectorized( complex_PRECISION *eta1, complex_PRECISION *eta2, - complex_PRECISION *phi, const int mu, schwarz_PRECISION_struct *s, level_struct *l, - int site ); - - - static inline void coarse_hopp_PRECISION_vectorized( vector_PRECISION eta, vector_PRECISION phi, - OPERATOR_TYPE_PRECISION *D, level_struct *l ) { -#ifdef VECTORIZE_COARSE_OPERATOR_PRECISION - int lda = SIMD_LENGTH_PRECISION*((l->num_lattice_site_var+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - cgenmv(l->num_lattice_site_var, D, lda, (float *)phi, (float *)eta); -#endif - } - static inline void coarse_n_hopp_PRECISION_vectorized( vector_PRECISION eta, vector_PRECISION phi, - OPERATOR_TYPE_PRECISION *D, level_struct *l ) { -#ifdef VECTORIZE_COARSE_OPERATOR_PRECISION - int lda = SIMD_LENGTH_PRECISION*((l->num_lattice_site_var+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - cgemv(l->num_lattice_site_var, D, lda, (float *)phi, (float *)eta); -#endif - } - - static inline void coarse_self_couplings_PRECISION_vectorized( vector_PRECISION eta, vector_PRECISION phi, - OPERATOR_TYPE_PRECISION *clover, int start, int end, level_struct *l ) { -#ifdef VECTORIZE_COARSE_OPERATOR_PRECISION - int site_size = l->num_lattice_site_var; - int lda = SIMD_LENGTH_PRECISION*((site_size+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - - for(int i=start; inum_lattice_site_var/2; - int num_eig_vect2 = num_eig_vect*num_eig_vect; - complex_PRECISION *eta[2] = {eta1, eta2}; - // U_mu(x) = [ A B , U_-mu(x+muhat) = [ A* -C* - // C D ] -B* D* ] - // storage order: A, C, B, D - - __m128 D_re; - __m128 D_im; - __m128 in_re; - __m128 in_im; - __m128 out_re; - __m128 out_im; - // s refers to "spin" components 0and1 (->eta1) or 2and3 (->eta2) - for(int s=0; s<2; s++) { - // t is the row of the input matrix (in 2x2 block form) - for(int t=0; t<2; t++) { - for(int i=0; inum_lattice_site_var/2; - int num_eig_vect2 = num_eig_vect*num_eig_vect; - complex_PRECISION *eta[2] = {eta1, eta2}; - // U_mu(x) = [ A B , U_-mu(x+muhat) = [ A* -C* - // C D ] -B* D* ] - // storage order: A, C, B, D - // note: minus sign of D = self_coupling - hopping_term is added here - - __m128 D_re; - __m128 D_im; - __m128 in_re; - __m128 in_im; - __m128 out_re; - __m128 out_im; - // s refers to "spin" components 0and1 (->eta1) or 2and3 (->eta2) - for(int s=0; s<2; s++) { - // t is the row of the input matrix (in 2x2 block form) - for(int t=0; t<2; t++) { - for(int i=0; inum_lattice_site_var/2; - int num_eig_vect2 = num_eig_vect*num_eig_vect; - complex_PRECISION *eta[2] = {eta1, eta2}; - // U_mu(x) = [ A B , U_-mu(x+muhat) = [ A* -C* - // C D ] -B* D* ] - // storage order: A, C, B, D - // note: minus sign of D = self_coupling - hopping_term is added here - - __m128 D_re; - __m128 D_im; - __m128 in_re; - __m128 in_im; - __m128 out_re; - __m128 out_im; - // A* - for(int i=0; i1?((k)*3+6):((k)*3)) +#define index_d_re(phi,mu,spin) (gamma_re_sign[mu][spin]) * (phi)[ 
6*gamma_co[mu][spin] + 12*(gamma_co[mu][spin]/2) + gamma_offset[mu][spin] ] +#define index_d_im(phi,mu,spin) (gamma_im_sign[mu][spin]) * (phi)[ 6*gamma_co[mu][spin] + 12*(gamma_co[mu][spin]/2) - gamma_offset[mu][spin] +1 ] + #define neighbor_coupling_file "sse_dirac_su3local.h" void prp_double( complex_double *prn[4], complex_double *phi, int start, int end ) { @@ -129,6 +133,137 @@ void prp_float( complex_float *prn[4], complex_float *phi, int start, int end ) } +void dprp_double( complex_double *prn[4], complex_double *phi, int start, int end ) { + + double *phi_pt = (double*)(phi+start); + double *phi_end = (double*)(phi+end); + double *pr[4] = {(double*)(prn[0]+start/2),(double*)(prn[1]+start/2),(double*)(prn[2]+start/2),(double*)(prn[3]+start/2)}; + + while ( phi_pt < phi_end ) { + + __m128d phi_pt1_re; __m128d phi_pt1_im; + + sse_complex_deinterleaved_load_pd( phi_pt, &phi_pt1_re, &phi_pt1_im ); + for ( int mu=0; mu<4; mu++) { + __m128d phi_pt2_re = _mm_setr_pd( index_d_re(phi_pt,mu,0), index_d_re(phi_pt+2,mu,0) ); + __m128d phi_pt2_im = _mm_setr_pd( index_d_im(phi_pt,mu,0), index_d_im(phi_pt+2,mu,0) ); + __m128d res_re = _mm_sub_pd( phi_pt1_re, phi_pt2_re ); + __m128d res_im = _mm_sub_pd( phi_pt1_im, phi_pt2_im ); + sse_complex_interleaved_store_pd( res_re, res_im, pr[mu] ); + pr[mu] += 2*SIMD_LENGTH_double; + } + + sse_complex_deinterleaved_load_pd( phi_pt+4, &phi_pt1_re, &phi_pt1_im ); + for ( int mu=0; mu<4; mu++) { + __m128d phi_pt2_re = _mm_setr_pd( index_d_re(phi_pt+4,mu,0), index_d_re(phi_pt,mu,1) ); + __m128d phi_pt2_im = _mm_setr_pd( index_d_im(phi_pt+4,mu,0), index_d_im(phi_pt,mu,1) ); + __m128d res_re = _mm_sub_pd( phi_pt1_re, phi_pt2_re ); + __m128d res_im = _mm_sub_pd( phi_pt1_im, phi_pt2_im ); + sse_complex_interleaved_store_pd( res_re, res_im, pr[mu] ); + pr[mu] += 2*SIMD_LENGTH_double; + } + + sse_complex_deinterleaved_load_pd( phi_pt+8, &phi_pt1_re, &phi_pt1_im ); + for ( int mu=0; mu<4; mu++) { + __m128d phi_pt2_re = _mm_setr_pd( index_d_re(phi_pt+2,mu,1), index_d_re(phi_pt+4,mu,1) ); + __m128d phi_pt2_im = _mm_setr_pd( index_d_im(phi_pt+2,mu,1), index_d_im(phi_pt+4,mu,1) ); + __m128d res_re = _mm_sub_pd( phi_pt1_re, phi_pt2_re ); + __m128d res_im = _mm_sub_pd( phi_pt1_im, phi_pt2_im ); + sse_complex_interleaved_store_pd( res_re, res_im, pr[mu] ); + pr[mu] += 2*SIMD_LENGTH_double; + } + + sse_complex_deinterleaved_load_pd( phi_pt+12, &phi_pt1_re, &phi_pt1_im ); + for ( int mu=0; mu<4; mu++) { + __m128d phi_pt2_re = _mm_setr_pd( index_d_re(phi_pt+12,mu,0), index_d_re(phi_pt+14,mu,0) ); + __m128d phi_pt2_im = _mm_setr_pd( index_d_im(phi_pt+12,mu,0), index_d_im(phi_pt+14,mu,0) ); + __m128d res_re = _mm_sub_pd( phi_pt1_re, phi_pt2_re ); + __m128d res_im = _mm_sub_pd( phi_pt1_im, phi_pt2_im ); + sse_complex_interleaved_store_pd( res_re, res_im, pr[mu] ); + pr[mu] += 2*SIMD_LENGTH_double; + } + + sse_complex_deinterleaved_load_pd( phi_pt+16, &phi_pt1_re, &phi_pt1_im ); + for ( int mu=0; mu<4; mu++) { + __m128d phi_pt2_re = _mm_setr_pd( index_d_re(phi_pt+16,mu,0), index_d_re(phi_pt+12,mu,1) ); + __m128d phi_pt2_im = _mm_setr_pd( index_d_im(phi_pt+16,mu,0), index_d_im(phi_pt+12,mu,1) ); + __m128d res_re = _mm_sub_pd( phi_pt1_re, phi_pt2_re ); + __m128d res_im = _mm_sub_pd( phi_pt1_im, phi_pt2_im ); + sse_complex_interleaved_store_pd( res_re, res_im, pr[mu] ); + pr[mu] += 2*SIMD_LENGTH_double; + } + + sse_complex_deinterleaved_load_pd( phi_pt+20, &phi_pt1_re, &phi_pt1_im ); + for ( int mu=0; mu<4; mu++) { + __m128d phi_pt2_re = _mm_setr_pd( index_d_re(phi_pt+14,mu,1), 
index_d_re(phi_pt+16,mu,1) ); + __m128d phi_pt2_im = _mm_setr_pd( index_d_im(phi_pt+14,mu,1), index_d_im(phi_pt+16,mu,1) ); + __m128d res_re = _mm_sub_pd( phi_pt1_re, phi_pt2_re ); + __m128d res_im = _mm_sub_pd( phi_pt1_im, phi_pt2_im ); + sse_complex_interleaved_store_pd( res_re, res_im, pr[mu] ); + pr[mu] += 2*SIMD_LENGTH_double; + } + + phi_pt += 48; + } +} + + +void dprp_float( complex_float *prn[4], complex_float *phi, int start, int end ) { + + float *phi_pt = (float*)(phi+start); + float *phi_end = (float*)(phi+end); + float *pr[4] = {(float*)(prn[0]+start/2),(float*)(prn[1]+start/2),(float*)(prn[2]+start/2),(float*)(prn[3]+start/2)}; + + while ( phi_pt < phi_end ) { + + __m128 phi_pt1_re = _mm_setr_ps( phi_pt[0], phi_pt[2], phi_pt[4], phi_pt[6] ); + __m128 phi_pt1_im = _mm_setr_ps( phi_pt[1], phi_pt[3], phi_pt[5], phi_pt[7] ); + for ( int mu=0; mu<4; mu++) { + __m128 phi_pt2_re = _mm_setr_ps( index_d_re(phi_pt,mu,0), index_d_re(phi_pt+2,mu,0), + index_d_re(phi_pt+4,mu,0), index_d_re(phi_pt,mu,1) ); + __m128 phi_pt2_im = _mm_setr_ps( index_d_im(phi_pt,mu,0), index_d_im(phi_pt+2,mu,0), + index_d_im(phi_pt+4,mu,0), index_d_im(phi_pt,mu,1) ); + + __m128 res_re = _mm_sub_ps( phi_pt1_re, phi_pt2_re ); + __m128 res_im = _mm_sub_ps( phi_pt1_im, phi_pt2_im ); + + sse_complex_interleaved_store( res_re, res_im, pr[mu] ); + pr[mu] += 8; + } + + phi_pt1_re = _mm_setr_ps( phi_pt[8], phi_pt[10], phi_pt[12], phi_pt[14] ); + phi_pt1_im = _mm_setr_ps( phi_pt[9], phi_pt[11], phi_pt[13], phi_pt[15] ); + for ( int mu=0; mu<4; mu++) { + __m128 phi_pt2_re = _mm_setr_ps( index_d_re(phi_pt+2,mu,1), index_d_re(phi_pt+4,mu,1), + index_d_re(phi_pt+12,mu,0), index_d_re(phi_pt+14,mu,0) ); + __m128 phi_pt2_im = _mm_setr_ps( index_d_im(phi_pt+2,mu,1), index_d_im(phi_pt+4,mu,1), + index_d_im(phi_pt+12,mu,0), index_d_im(phi_pt+14,mu,0) ); + __m128 res_re = _mm_sub_ps( phi_pt1_re, phi_pt2_re ); + __m128 res_im = _mm_sub_ps( phi_pt1_im, phi_pt2_im ); + + sse_complex_interleaved_store( res_re, res_im, pr[mu] ); + pr[mu] += 8; + } + + phi_pt1_re = _mm_setr_ps( phi_pt[16], phi_pt[18], phi_pt[20], phi_pt[22] ); + phi_pt1_im = _mm_setr_ps( phi_pt[17], phi_pt[19], phi_pt[21], phi_pt[23] ); + for ( int mu=0; mu<4; mu++) { + __m128 phi_pt2_re = _mm_setr_ps( index_d_re(phi_pt+16,mu,0), index_d_re(phi_pt+12,mu,1), + index_d_re(phi_pt+14,mu,1), index_d_re(phi_pt+16,mu,1) ); + __m128 phi_pt2_im = _mm_setr_ps( index_d_im(phi_pt+16,mu,0), index_d_im(phi_pt+12,mu,1), + index_d_im(phi_pt+14,mu,1), index_d_im(phi_pt+16,mu,1) ); + __m128 res_re = _mm_sub_ps( phi_pt1_re, phi_pt2_re ); + __m128 res_im = _mm_sub_ps( phi_pt1_im, phi_pt2_im ); + + sse_complex_interleaved_store( res_re, res_im, pr[mu] ); + pr[mu] += 8; + } + + phi_pt+=48; + } +} + + void prn_su3_double( complex_double *prp[4], complex_double *phi, operator_double_struct *op, int *neighbor, int start, int end ) { double *phi_pt = (double*)(phi+start); @@ -305,6 +440,267 @@ void prn_su3_float( complex_float *prp[4], complex_float *phi, operator_float_st } +void dprn_su3_double( complex_double *prp[4], complex_double *phi, operator_double_struct *op, int *neighbor, int start, int end ) { + + double *phi_pt = (double*)(phi+start); + double *phi_end_pt = (double*)(phi+end); + double *pr[4] = {(double*)(prp[0]),(double*)(prp[1]),(double*)(prp[2]),(double*)(prp[3])}; + double *D_pt = ((double*)(op->D))+2*(start/24*36); + int *nb_pt = neighbor+((start/24)*4); + + while ( phi_pt < phi_end_pt ) { + + __m128d in_re[6]; + __m128d in_im[6]; + + for ( int i=0; i<3; i++ ) { + in_re[i] = 
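
[Editor's note, an inference not stated in the patch: the new index_d_re / index_d_im macros differ from the single-flavour versions by the extra 12*(gamma_co[mu][spin]/2) term, and together with the 48-real stride of dprp_double/dprp_float this suggests a doublet site packed as [ f0 spin0/1 | f1 spin0/1 | f0 spin2/3 | f1 spin2/3 ]. A tiny self-check of the offsets that layout implies.]

#include <assert.h>

/* real-valued offset of flavour 0, colour 0, for a given spin in the assumed
 * doublet layout; the 12*(spin/2) term skips flavour 1's upper-spin block */
static int doublet_spin_offset( int spin ) {
  return 6*spin + 12*(spin/2);
}

int main( void ) {
  assert( doublet_spin_offset( 0 ) ==  0 );
  assert( doublet_spin_offset( 1 ) ==  6 );
  assert( doublet_spin_offset( 2 ) == 24 );
  assert( doublet_spin_offset( 3 ) == 30 );
  return 0;
}
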
_mm_setr_pd( phi_pt[2*i+0], phi_pt[2*i+6] ); + in_im[i] = _mm_setr_pd( phi_pt[2*i+1], phi_pt[2*i+7] ); + } + for ( int i=3; i<6; i++ ) { + in_re[i] = _mm_setr_pd( phi_pt[2*i+6], phi_pt[2*i+12] ); + in_im[i] = _mm_setr_pd( phi_pt[2*i+7], phi_pt[2*i+13] ); + } + + for ( int mu=0; mu<4; mu++ ) { + + __m128d v_re[6]; + __m128d v_im[6]; + + // calc spin projection + for ( int i=0; i<3; i++ ) { + v_re[i] = _mm_setr_pd( index_d_re(phi_pt+2*i,mu,0), index_d_re(phi_pt+2*i,mu,1) ); + v_im[i] = _mm_setr_pd( index_d_im(phi_pt+2*i,mu,0), index_d_im(phi_pt+2*i,mu,1) ); + v_re[i] = _mm_add_pd( in_re[i], v_re[i] ); + v_im[i] = _mm_add_pd( in_im[i], v_im[i] ); + } + for ( int i=3; i<6; i++ ) { + v_re[i] = _mm_setr_pd( index_d_re(phi_pt+6+2*i,mu,0), index_d_re(phi_pt+6+2*i,mu,1) ); + v_im[i] = _mm_setr_pd( index_d_im(phi_pt+6+2*i,mu,0), index_d_im(phi_pt+6+2*i,mu,1) ); + v_re[i] = _mm_add_pd( in_re[i], v_re[i] ); + v_im[i] = _mm_add_pd( in_im[i], v_im[i] ); + } + + { + __m128d res_re[6]; + __m128d res_im[6]; + // load su(3) matrix and multiply + for ( int i=0; i<3; i++ ) { + __m128d buf_re = _mm_set1_pd( D_pt[0+2*i] ); + __m128d buf_im = _mm_set1_pd( D_pt[1+2*i] ); + cmul_conj_pd( buf_re, buf_im, v_re[0], v_im[0], &res_re[i], &res_im[i] ); + cmul_conj_pd( buf_re, buf_im, v_re[3], v_im[3], &res_re[i+3], &res_im[i+3] ); + buf_re = _mm_set1_pd( D_pt[6+2*i] ); + buf_im = _mm_set1_pd( D_pt[7+2*i] ); + cfmadd_conj_pd( buf_re, buf_im, v_re[1], v_im[1], &res_re[i], &res_im[i] ); + cfmadd_conj_pd( buf_re, buf_im, v_re[4], v_im[4], &res_re[i+3], &res_im[i+3] ); + buf_re = _mm_set1_pd( D_pt[12+2*i] ); + buf_im = _mm_set1_pd( D_pt[13+2*i] ); + cfmadd_conj_pd( buf_re, buf_im, v_re[2], v_im[2], &res_re[i], &res_im[i] ); + cfmadd_conj_pd( buf_re, buf_im, v_re[5], v_im[5], &res_re[i+3], &res_im[i+3] ); + } + + { + double *pr_pt = pr[mu]+2*12*(*(nb_pt)); + for ( int i=0; i<3; i++ ) { + __m128d out1 = _mm_unpacklo_pd( res_re[i], res_im[i] ); + __m128d out2 = _mm_unpackhi_pd( res_re[i], res_im[i] ); + _mm_storeu_pd( pr_pt+0+2*i, out1 ); + _mm_storeu_pd( pr_pt+6+2*i, out2 ); + } + for ( int i=3; i<6; i++ ) { + __m128d out1 = _mm_unpacklo_pd( res_re[i], res_im[i] ); + __m128d out2 = _mm_unpackhi_pd( res_re[i], res_im[i] ); + _mm_storeu_pd( pr_pt+ 6+2*i, out1 ); + _mm_storeu_pd( pr_pt+12+2*i, out2 ); + } + } + } + + D_pt += 18; + nb_pt++; + } + + phi_pt += 24*2; + } + +} + + +void dprn_su3_float( complex_float *prp[4], complex_float *phi, operator_float_struct *op, int *neighbor, int start, int end ) { + + float *phi_pt = (float*)(phi+start); + float *phi_end_pt = (float*)(phi+end); + float *pr[4] = {(float*)(prp[0]),(float*)(prp[1]),(float*)(prp[2]),(float*)(prp[3])}; + float *D_pt = (float*)(op->D_transformed_vectorized+2*(start/24*48)); + int *nb_pt = neighbor+((start/24)*4); + + while ( phi_pt < phi_end_pt ) { + + __m128 in11[2]; + __m128 in21[2]; + __m128 in12[2]; + __m128 in22[2]; + + in11[0] = _mm_setr_ps( phi_pt[0], phi_pt[2], phi_pt[4], 0 ); + in11[1] = _mm_setr_ps( phi_pt[1], phi_pt[3], phi_pt[5], 0 ); + in21[0] = _mm_setr_ps( phi_pt[6], phi_pt[8], phi_pt[10], 0 ); + in21[1] = _mm_setr_ps( phi_pt[7], phi_pt[9], phi_pt[11], 0 ); + in12[0] = _mm_setr_ps( phi_pt[12], phi_pt[14], phi_pt[16], 0 ); + in12[1] = _mm_setr_ps( phi_pt[13], phi_pt[15], phi_pt[17], 0 ); + in22[0] = _mm_setr_ps( phi_pt[18], phi_pt[20], phi_pt[22], 0 ); + in22[1] = _mm_setr_ps( phi_pt[19], phi_pt[21], phi_pt[23], 0 ); + + for ( int mu=0; mu<4; mu++ ) { + __m128 res11[2]; + __m128 res21[2]; + __m128 res12[2]; + __m128 res22[2]; + + { + // calc spin0 
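
[Editor's sketch: the cmul_conj / cfmadd_conj sequences in dprn_su3_double gather one column of the SU(3) link at a time and accumulate against its conjugate, i.e. they apply the adjoint link to the projected colour vector. A scalar reference of that operation, assuming the 18-real row-major link storage visible in the loads above.]

#include <complex.h>

/* y = U^dagger x for a 3x3 complex link stored as 18 reals, re/im interleaved,
 * element (r,c) at U[6*r+2*c] (re) and U[6*r+2*c+1] (im) */
void su3_dagger_mvm( double complex y[3], const double U[18], const double complex x[3] ) {
  for ( int c=0; c<3; c++ ) {
    y[c] = 0.0;
    for ( int r=0; r<3; r++ ) {
      double complex u = U[6*r+2*c] + U[6*r+2*c+1]*I;
      y[c] += conj( u )*x[r];   /* (U^dagger x)_c = sum_r conj(U_{r,c}) x_r */
    }
  }
}
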
projection + res11[0] = _mm_setr_ps( index_d_re(phi_pt,mu,0), index_d_re(phi_pt+2,mu,0), index_d_re(phi_pt+4,mu,0), 0 ); + res11[1] = _mm_setr_ps( index_d_im(phi_pt,mu,0), index_d_im(phi_pt+2,mu,0), index_d_im(phi_pt+4,mu,0), 0 ); + __m128 in11_re = _mm_add_ps( in11[0], res11[0] ); + __m128 in11_im = _mm_add_ps( in11[1], res11[1] ); + + // calc spin1 projection + res11[0] = _mm_setr_ps( index_d_re(phi_pt,mu,1), index_d_re(phi_pt+2,mu,1), index_d_re(phi_pt+4,mu,1), 0 ); + res11[1] = _mm_setr_ps( index_d_im(phi_pt,mu,1), index_d_im(phi_pt+2,mu,1), index_d_im(phi_pt+4,mu,1), 0 ); + __m128 in21_re = _mm_add_ps( in21[0], res11[0] ); + __m128 in21_im = _mm_add_ps( in21[1], res11[1] ); + + // calc spin0 projection + res12[0] = _mm_setr_ps( index_d_re(phi_pt+12,mu,0), index_d_re(phi_pt+14,mu,0), index_d_re(phi_pt+16,mu,0), 0 ); + res12[1] = _mm_setr_ps( index_d_im(phi_pt+12,mu,0), index_d_im(phi_pt+14,mu,0), index_d_im(phi_pt+16,mu,0), 0 ); + __m128 in12_re = _mm_add_ps( in12[0], res12[0] ); + __m128 in12_im = _mm_add_ps( in12[1], res12[1] ); + + // calc spin1 projection + res12[0] = _mm_setr_ps( index_d_re(phi_pt+12,mu,1), index_d_re(phi_pt+14,mu,1), index_d_re(phi_pt+16,mu,1), 0 ); + res12[1] = _mm_setr_ps( index_d_im(phi_pt+12,mu,1), index_d_im(phi_pt+14,mu,1), index_d_im(phi_pt+16,mu,1), 0 ); + __m128 in22_re = _mm_add_ps( in22[0], res12[0] ); + __m128 in22_im = _mm_add_ps( in22[1], res12[1] ); + + // load 1st part of su(3) matrix and multiply + { + __m128 buf1 = _mm_loadu_ps( D_pt ); + __m128 buf2 = _mm_loadu_ps( D_pt+SIMD_LENGTH_float ); + { + __m128 buf3 = _mm_shuffle_ps( in11_re, in11_re, _MM_SHUFFLE(0,0,0,0) ); + __m128 buf4 = _mm_shuffle_ps( in11_im, in11_im, _MM_SHUFFLE(0,0,0,0) ); + cmul_conj( buf1, buf2, buf3, buf4, &res11[0], &res11[1] ); + } + { + __m128 buf3 = _mm_shuffle_ps( in21_re, in21_re, _MM_SHUFFLE(0,0,0,0) ); + __m128 buf4 = _mm_shuffle_ps( in21_im, in21_im, _MM_SHUFFLE(0,0,0,0) ); + cmul_conj( buf1, buf2, buf3, buf4, &res21[0], &res21[1] ); + } + { + __m128 buf3 = _mm_shuffle_ps( in12_re, in12_re, _MM_SHUFFLE(0,0,0,0) ); + __m128 buf4 = _mm_shuffle_ps( in12_im, in12_im, _MM_SHUFFLE(0,0,0,0) ); + cmul_conj( buf1, buf2, buf3, buf4, &res12[0], &res12[1] ); + } + { + __m128 buf3 = _mm_shuffle_ps( in22_re, in22_re, _MM_SHUFFLE(0,0,0,0) ); + __m128 buf4 = _mm_shuffle_ps( in22_im, in22_im, _MM_SHUFFLE(0,0,0,0) ); + cmul_conj( buf1, buf2, buf3, buf4, &res22[0], &res22[1] ); + } + } + // load 2nd part of su(3) matrix and multiply + { + __m128 buf1 = _mm_loadu_ps( D_pt+2*SIMD_LENGTH_float ); + __m128 buf2 = _mm_loadu_ps( D_pt+3*SIMD_LENGTH_float ); + { + __m128 buf3 = _mm_shuffle_ps( in11_re, in11_re, _MM_SHUFFLE(1,1,1,1) ); + __m128 buf4 = _mm_shuffle_ps( in11_im, in11_im, _MM_SHUFFLE(1,1,1,1) ); + cfmadd_conj( buf1, buf2, buf3, buf4, &res11[0], &res11[1] ); + } + { + __m128 buf3 = _mm_shuffle_ps( in21_re, in21_re, _MM_SHUFFLE(1,1,1,1) ); + __m128 buf4 = _mm_shuffle_ps( in21_im, in21_im, _MM_SHUFFLE(1,1,1,1) ); + cfmadd_conj( buf1, buf2, buf3, buf4, &res21[0], &res21[1] ); + } + { + __m128 buf3 = _mm_shuffle_ps( in12_re, in12_re, _MM_SHUFFLE(1,1,1,1) ); + __m128 buf4 = _mm_shuffle_ps( in12_im, in12_im, _MM_SHUFFLE(1,1,1,1) ); + cfmadd_conj( buf1, buf2, buf3, buf4, &res12[0], &res12[1] ); + } + { + __m128 buf3 = _mm_shuffle_ps( in22_re, in22_re, _MM_SHUFFLE(1,1,1,1) ); + __m128 buf4 = _mm_shuffle_ps( in22_im, in22_im, _MM_SHUFFLE(1,1,1,1) ); + cfmadd_conj( buf1, buf2, buf3, buf4, &res22[0], &res22[1] ); + } + } + // load 3rd part of su(3) matrix and multiply + { + __m128 buf1 = 
_mm_loadu_ps( D_pt+4*SIMD_LENGTH_float ); + __m128 buf2 = _mm_loadu_ps( D_pt+5*SIMD_LENGTH_float ); + { + __m128 buf3 = _mm_shuffle_ps( in11_re, in11_re, _MM_SHUFFLE(2,2,2,2) ); + __m128 buf4 = _mm_shuffle_ps( in11_im, in11_im, _MM_SHUFFLE(2,2,2,2) ); + cfmadd_conj( buf1, buf2, buf3, buf4, &res11[0], &res11[1] ); + } + { + __m128 buf3 = _mm_shuffle_ps( in21_re, in21_re, _MM_SHUFFLE(2,2,2,2) ); + __m128 buf4 = _mm_shuffle_ps( in21_im, in21_im, _MM_SHUFFLE(2,2,2,2) ); + cfmadd_conj( buf1, buf2, buf3, buf4, &res21[0], &res21[1] ); + } + { + __m128 buf3 = _mm_shuffle_ps( in12_re, in12_re, _MM_SHUFFLE(2,2,2,2) ); + __m128 buf4 = _mm_shuffle_ps( in12_im, in12_im, _MM_SHUFFLE(2,2,2,2) ); + cfmadd_conj( buf1, buf2, buf3, buf4, &res12[0], &res12[1] ); + } + { + __m128 buf3 = _mm_shuffle_ps( in22_re, in22_re, _MM_SHUFFLE(2,2,2,2) ); + __m128 buf4 = _mm_shuffle_ps( in22_im, in22_im, _MM_SHUFFLE(2,2,2,2) ); + cfmadd_conj( buf1, buf2, buf3, buf4, &res22[0], &res22[1] ); + } + } + } + + float *pr_pt = pr[mu]+2*12*(*nb_pt); + { + __m128 buf1 = _mm_unpacklo_ps( res11[0], res11[1] ); + __m128 buf2 = _mm_unpackhi_ps( res11[0], res11[1] ); + __m128 buf3 = _mm_unpacklo_ps( res21[0], res21[1] ); + + { + __m128 buf4 = _mm_unpackhi_ps( res21[0], res21[1] ); + buf2 = _mm_movelh_ps( buf2, buf3 ); + buf3 = _mm_shuffle_ps( buf3, buf4, _MM_SHUFFLE(1,0,3,2) ); // hi(buf3), lo(buf4) + } + { + _mm_storeu_ps( pr_pt, buf1 ); + _mm_storeu_ps( pr_pt+4, buf2 ); + _mm_storeu_ps( pr_pt+8, buf3 ); + } + } + { + __m128 buf1 = _mm_unpacklo_ps( res12[0], res12[1] ); + __m128 buf2 = _mm_unpackhi_ps( res12[0], res12[1] ); + __m128 buf3 = _mm_unpacklo_ps( res22[0], res22[1] ); + + { + __m128 buf4 = _mm_unpackhi_ps( res22[0], res22[1] ); + buf2 = _mm_movelh_ps( buf2, buf3 ); + buf3 = _mm_shuffle_ps( buf3, buf4, _MM_SHUFFLE(1,0,3,2) ); // hi(buf3), lo(buf4) + } + { + _mm_storeu_ps( pr_pt+12, buf1 ); + _mm_storeu_ps( pr_pt+16, buf2 ); + _mm_storeu_ps( pr_pt+20, buf3 ); + } + } + nb_pt++; + D_pt += 24; + } + + phi_pt += 48; + } +} + + void pbn_double( complex_double *eta, complex_double *prp[4], int start, int end ) { double *pr[4] = {(double*)(prp[0]),(double*)(prp[1]),(double*)(prp[2]),(double*)(prp[3])}; @@ -483,15 +879,283 @@ void pbn_float( complex_float *eta, complex_float *prp[4], int start, int end ) } } - - -void su3_pbp_double( complex_double* eta, complex_double *prn[4], operator_double_struct *op, - int *neighbor, int start, int end ) { +void dpbn_double( complex_double *eta, complex_double *prp[4], int start, int end ) { - double *D_pt = ((double*)(op->D))+2*(start*3); + double *pr[4] = {(double*)(prp[0]),(double*)(prp[1]),(double*)(prp[2]),(double*)(prp[3])}; double *eta_pt = (double*)(eta+start); - double *eta_end_pt = (double*)(eta+end); - double *pr[4] = {(double*)(prn[0]),(double*)(prn[1]),(double*)(prn[2]),(double*)(prn[3])}; + + __m128d gamma0[4]; + __m128d gamma1[4]; + + for ( int mu=0; mu<4; mu++ ) { + gamma0[mu] = _mm_setr_pd( gamma_re_sign[mu][gamma_co[mu][0]], gamma_im_sign[mu][gamma_co[mu][0]] ); + gamma1[mu] = _mm_setr_pd( gamma_re_sign[mu][gamma_co[mu][1]], gamma_im_sign[mu][gamma_co[mu][1]] ); + } + + for ( int i=start; iD))+2*(start*3); + double *eta_pt = (double*)(eta+start); + double *eta_end_pt = (double*)(eta+end); + double *pr[4] = {(double*)(prn[0]),(double*)(prn[1]),(double*)(prn[2]),(double*)(prn[3])}; int *nb_pt = neighbor+((start/12)*4); __m128d gamma0[4]; @@ -606,41 +1270,477 @@ void su3_pbp_double( complex_double* eta, complex_double *prn[4], operator_doubl res[3*gamma_co[Z][0]+i] = _mm_sub_pd( 
res[3*gamma_co[Z][0]+i], buf1 ); } for ( int i=0; i<3; i++ ) { - __m128d buf1 = _mm_mul_pd( gamma1[Z], GAMMA_Z_SHUFFLE(in[3+i]) ); - res[3*gamma_co[Z][1]+i] = _mm_sub_pd( res[3*gamma_co[Z][1]+i], buf1 ); + __m128d buf1 = _mm_mul_pd( gamma1[Z], GAMMA_Z_SHUFFLE(in[3+i]) ); + res[3*gamma_co[Z][1]+i] = _mm_sub_pd( res[3*gamma_co[Z][1]+i], buf1 ); + } + } + } + // --------------- + // mu = Y + { + __m128d res_re[3]; + __m128d res_im[3]; + { + __m128d v_re[3]; + __m128d v_im[3]; + int j = 2*6*(*nb_pt); + + for ( int i=0; i<3; i++ ) { + v_re[i] = _mm_setr_pd( *(pr[Y]+j+0+2*i), *(pr[Y]+j+6+2*i) ); + v_im[i] = _mm_setr_pd( *(pr[Y]+j+1+2*i), *(pr[Y]+j+7+2*i) ); + } + for ( int i=0; i<3; i++ ) { + __m128d buf_re = _mm_set1_pd( D_pt[0+6*i] ); + __m128d buf_im = _mm_set1_pd( D_pt[1+6*i] ); + cmul_pd( buf_re, buf_im, v_re[0], v_im[0], &res_re[i], &res_im[i] ); + buf_re = _mm_set1_pd( D_pt[2+6*i] ); + buf_im = _mm_set1_pd( D_pt[3+6*i] ); + cfmadd_pd( buf_re, buf_im, v_re[1], v_im[1], &res_re[i], &res_im[i] ); + buf_re = _mm_set1_pd( D_pt[4+6*i] ); + buf_im = _mm_set1_pd( D_pt[5+6*i] ); + cfmadd_pd( buf_re, buf_im, v_re[2], v_im[2], &res_re[i], &res_im[i] ); + } + D_pt += 18; + nb_pt++; + } + { + __m128d in[6]; + in[0] = _mm_unpacklo_pd( res_re[0], res_im[0] ); + in[1] = _mm_unpacklo_pd( res_re[1], res_im[1] ); + in[2] = _mm_unpacklo_pd( res_re[2], res_im[2] ); + + in[3] = _mm_unpackhi_pd( res_re[0], res_im[0] ); + in[4] = _mm_unpackhi_pd( res_re[1], res_im[1] ); + in[5] = _mm_unpackhi_pd( res_re[2], res_im[2] ); + + for ( int i=0; i<6; i++ ) { + res[i] = _mm_sub_pd( res[i], in[i] ); + } + for ( int i=0; i<3; i++ ) { + __m128d buf1 = _mm_mul_pd( gamma0[Y], GAMMA_Y_SHUFFLE(in[i]) ); + res[3*gamma_co[Y][0]+i] = _mm_sub_pd( res[3*gamma_co[Y][0]+i], buf1 ); + } + for ( int i=0; i<3; i++ ) { + __m128d buf1 = _mm_mul_pd( gamma1[Y], GAMMA_Y_SHUFFLE(in[3+i]) ); + res[3*gamma_co[Y][1]+i] = _mm_sub_pd( res[3*gamma_co[Y][1]+i], buf1 ); + } + } + } + // --------------- + // mu = X + { + __m128d res_re[3]; + __m128d res_im[3]; + { + __m128d v_re[3]; + __m128d v_im[3]; + int j = 2*6*(*nb_pt); + + for ( int i=0; i<3; i++ ) { + v_re[i] = _mm_setr_pd( *(pr[X]+j+0+2*i), *(pr[X]+j+6+2*i) ); + v_im[i] = _mm_setr_pd( *(pr[X]+j+1+2*i), *(pr[X]+j+7+2*i) ); + } + for ( int i=0; i<3; i++ ) { + __m128d buf_re = _mm_set1_pd( D_pt[0+6*i] ); + __m128d buf_im = _mm_set1_pd( D_pt[1+6*i] ); + cmul_pd( buf_re, buf_im, v_re[0], v_im[0], &res_re[i], &res_im[i] ); + buf_re = _mm_set1_pd( D_pt[2+6*i] ); + buf_im = _mm_set1_pd( D_pt[3+6*i] ); + cfmadd_pd( buf_re, buf_im, v_re[1], v_im[1], &res_re[i], &res_im[i] ); + buf_re = _mm_set1_pd( D_pt[4+6*i] ); + buf_im = _mm_set1_pd( D_pt[5+6*i] ); + cfmadd_pd( buf_re, buf_im, v_re[2], v_im[2], &res_re[i], &res_im[i] ); + } + D_pt += 18; + nb_pt++; + } + { + __m128d in[6]; + in[0] = _mm_unpacklo_pd( res_re[0], res_im[0] ); + in[1] = _mm_unpacklo_pd( res_re[1], res_im[1] ); + in[2] = _mm_unpacklo_pd( res_re[2], res_im[2] ); + + in[3] = _mm_unpackhi_pd( res_re[0], res_im[0] ); + in[4] = _mm_unpackhi_pd( res_re[1], res_im[1] ); + in[5] = _mm_unpackhi_pd( res_re[2], res_im[2] ); + + for ( int i=0; i<6; i++ ) { + res[i] = _mm_sub_pd( res[i], in[i] ); + } + for ( int i=0; i<3; i++ ) { + __m128d buf1 = _mm_mul_pd( gamma0[X], GAMMA_X_SHUFFLE(in[i]) ); + res[3*gamma_co[X][0]+i] = _mm_sub_pd( res[3*gamma_co[X][0]+i], buf1 ); + } + for ( int i=0; i<3; i++ ) { + __m128d buf1 = _mm_mul_pd( gamma1[X], GAMMA_X_SHUFFLE(in[3+i]) ); + res[3*gamma_co[X][1]+i] = _mm_sub_pd( res[3*gamma_co[X][1]+i], buf1 ); + } + } + } + 
// --------------- + + for ( int i=0; i<12; i++ ) { + _mm_storeu_pd( eta_pt + 2*i, res[i] ); + } + eta_pt+=24; + } + +} + + +void su3_pbp_float( complex_float* eta, complex_float *prn[4], operator_float_struct *op, + int *neighbor, int start, int end ) { + + float *D_pt = (float*)(op->D_vectorized+2*(start*4)); + float *eta_pt = (float*)(eta+start); + float *eta_end_pt = (float*)(eta+end); + float *pr[4] = {(float*)(prn[0]),(float*)(prn[1]),(float*)(prn[2]),(float*)(prn[3])}; + int *nb_pt = neighbor+((start/12)*4); + + __m128 gamma0[4][2]; + __m128 gamma1[4][2]; + + for ( int mu=0; mu<4; mu++ ) { + gamma0[mu][0] = _mm_set1_ps( -gamma_re_sign[mu][gamma_co[mu][0]] ); + gamma0[mu][1] = _mm_set1_ps( -gamma_im_sign[mu][gamma_co[mu][0]] ); + gamma1[mu][0] = _mm_set1_ps( -gamma_re_sign[mu][gamma_co[mu][1]] ); + gamma1[mu][1] = _mm_set1_ps( -gamma_im_sign[mu][gamma_co[mu][1]] ); + } + + while( eta_pt < eta_end_pt ) { + + __m128 eta_lo1 = _mm_loadu_ps( eta_pt ); + __m128 eta_lo2 = _mm_loadu_ps( eta_pt + 4 ); + __m128 eta_hi1 = _mm_loadu_ps( eta_pt + 6 ); + __m128 eta_hi2 = _mm_loadu_ps( eta_pt + 10 ); + + __m128 eta2_lo[2]; + __m128 eta2_hi[2]; + + eta2_lo[0] = _mm_loadu_ps( eta_pt + 12 ); + eta2_hi[0] = _mm_loadu_ps( eta_pt + 14 ); + eta2_lo[1] = _mm_loadu_ps( eta_pt + 18 ); + eta2_hi[1] = _mm_loadu_ps( eta_pt + 20 ); + + for ( int mu=0; mu<4; mu++ ) { + __m128 res1[2]; + __m128 res2[2]; + + { + int j = 2*6*(*nb_pt); + // load 1st part of su(3) matrix and multiply + { + __m128 buf1 = _mm_loadu_ps( D_pt ); + __m128 buf2 = _mm_loadu_ps( D_pt+SIMD_LENGTH_float ); + { + __m128 buf3 = _mm_set1_ps( *(pr[mu]+j+0) ); + __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+1) ); + cmul( buf1, buf2, buf3, buf4, &res1[0], &res1[1] ); + } + { + __m128 buf3 = _mm_set1_ps( *(pr[mu]+j+6) ); + __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+7) ); + cmul( buf1, buf2, buf3, buf4, &res2[0], &res2[1] ); + } + } + // load 2nd part of su(3) matrix and multiply + { + __m128 buf1 = _mm_loadu_ps( D_pt+2*SIMD_LENGTH_float ); + __m128 buf2 = _mm_loadu_ps( D_pt+3*SIMD_LENGTH_float ); + { + __m128 buf3 = _mm_set1_ps( *(pr[mu]+j+2) ); + __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+3) ); + cfmadd( buf1, buf2, buf3, buf4, &res1[0], &res1[1] ); + } + { + __m128 buf3 = _mm_set1_ps( *(pr[mu]+j+8) ); + __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+9) ); + cfmadd( buf1, buf2, buf3, buf4, &res2[0], &res2[1] ); + } + } + // load 3rd part of su(3) matrix and multiply + { + __m128 buf1 = _mm_loadu_ps( D_pt+4*SIMD_LENGTH_float ); + __m128 buf2 = _mm_loadu_ps( D_pt+5*SIMD_LENGTH_float ); + { + __m128 buf3 = _mm_set1_ps( *(pr[mu]+j+4) ); + __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+5) ); + cfmadd( buf1, buf2, buf3, buf4, &res1[0], &res1[1] ); + } + { + __m128 buf3 = _mm_set1_ps( *(pr[mu]+j+10) ); + __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+11) ); + cfmadd( buf1, buf2, buf3, buf4, &res2[0], &res2[1] ); + } + } + } + + { + // store spin0 contribution + { + __m128 buf1 = _mm_unpacklo_ps( res1[0], res1[1] ); + __m128 buf2 = _mm_unpackhi_ps( res1[0], res1[1] ); + eta_lo1 = _mm_sub_ps( eta_lo1, buf1 ); + eta_lo2 = _mm_sub_ps( eta_lo2, buf2 ); + } + + // store contribution from 1st SU(3) multiplication to either spin2 or spin3 + __m128 buf1 = _mm_mul_ps( gamma0[mu][0], res1[gamma_offset[mu][0]] ); + __m128 buf2 = _mm_mul_ps( gamma0[mu][1], res1[1-gamma_offset[mu][0]] ); + __m128 buf3 = _mm_unpacklo_ps( buf1, buf2 ); + __m128 buf4 = _mm_unpackhi_ps( buf1, buf2 ); + buf4 = _mm_shuffle_ps( buf3, buf4, _MM_SHUFFLE(1,0,3,2) ); // hi(buf3), lo(buf4) + eta2_lo[gamma_co[mu][2]] = _mm_sub_ps( 
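
[Editor's sketch: in su3_pbp / su3_dpbp the U-multiplied half spinor is subtracted from the upper spin components, and a gamma-sign-weighted copy is subtracted from the coupled lower components (the shuffles implement multiplication by +-1 or +-i). A scalar version of that lift, with the library's gamma tables left as parameters rather than reproduced.]

#include <complex.h>

/* eta: one site, 4 spins x 3 colours; h: U-multiplied half spinor, 2 spins x 3 colours.
 * gco[mu][s] is the lower spin coupled to projected spin s, gs[mu][s] its phase (+-1, +-i). */
void lift_minus_dir( double complex eta[12], const double complex h[6],
                     int mu, const int gco[4][2], const double complex gs[4][2] ) {
  for ( int s=0; s<2; s++ )
    for ( int c=0; c<3; c++ ) {
      eta[3*s+c]          -= h[3*s+c];             /* upper spin components        */
      eta[3*gco[mu][s]+c] -= gs[mu][s]*h[3*s+c];   /* coupled lower spin component */
    }
}
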
eta2_lo[gamma_co[mu][2]], buf3 ); + eta2_hi[gamma_co[mu][2]] = _mm_sub_ps( eta2_hi[gamma_co[mu][2]], buf4 ); + } + + { + // store spin1 contribution + { + __m128 buf1 = _mm_unpacklo_ps( res2[0], res2[1] ); + __m128 buf2 = _mm_unpackhi_ps( res2[0], res2[1] ); + eta_hi1 = _mm_sub_ps( eta_hi1, buf1 ); + eta_hi2 = _mm_sub_ps( eta_hi2, buf2 ); + } + + // store contribution from 1st SU(3) multiplication to either spin2 or spin3 + __m128 buf1 = _mm_mul_ps( gamma1[mu][0], res2[gamma_offset[mu][1]] ); + __m128 buf2 = _mm_mul_ps( gamma1[mu][1], res2[1-gamma_offset[mu][1]] ); + __m128 buf3 = _mm_unpacklo_ps( buf1, buf2 ); + __m128 buf4 = _mm_unpackhi_ps( buf1, buf2 ); + buf4 = _mm_shuffle_ps( buf3, buf4, _MM_SHUFFLE(1,0,3,2) ); // hi(buf3), lo(buf4) + eta2_lo[gamma_co[mu][3]] = _mm_sub_ps( eta2_lo[gamma_co[mu][3]], buf3 ); + eta2_hi[gamma_co[mu][3]] = _mm_sub_ps( eta2_hi[gamma_co[mu][3]], buf4 ); + } + + nb_pt++; + D_pt += 24; + } + + _mm_storeu_ps( eta_pt, eta_lo1 ); + _mm_storeu_ps( eta_pt+4, eta_lo2 ); + _mm_storeu_ps( eta_pt+6, eta_hi1 ); + _mm_storeu_ps( eta_pt+10, eta_hi2 ); + _mm_storeu_ps( eta_pt+12, eta2_lo[0] ); + _mm_storeu_ps( eta_pt+14, eta2_hi[0] ); + _mm_storeu_ps( eta_pt+18, eta2_lo[1] ); + _mm_storeu_ps( eta_pt+20, eta2_hi[1] ); + + eta_pt += 24; + } + +} + + +void su3_dpbp_double( complex_double* eta, complex_double *prn[4], operator_double_struct *op, + int *neighbor, int start, int end ) { + + double *D_pt = ((double*)(op->D))+2*(start/24*36); + double *eta_pt = (double*)(eta+start); + double *eta_end_pt = (double*)(eta+end); + double *pr[4] = {(double*)(prn[0]),(double*)(prn[1]),(double*)(prn[2]),(double*)(prn[3])}; + int *nb_pt = neighbor+((start/24)*4); + + __m128d gamma0[4]; + __m128d gamma1[4]; + + for ( int mu=0; mu<4; mu++ ) { + gamma0[mu] = _mm_setr_pd( -gamma_re_sign[mu][gamma_co[mu][0]], -gamma_im_sign[mu][gamma_co[mu][0]] ); + gamma1[mu] = _mm_setr_pd( -gamma_re_sign[mu][gamma_co[mu][1]], -gamma_im_sign[mu][gamma_co[mu][1]] ); + } + + while( eta_pt < eta_end_pt ) { + + __m128d res[24]; + for ( int i=0; i<24; i++ ) { + res[i] = _mm_loadu_pd( eta_pt + 2*i ); + } + + // --------------- + // mu = T + { + __m128d res_re[6]; + __m128d res_im[6]; + { + __m128d v_re[6]; + __m128d v_im[6]; + int j = 2*12*(*nb_pt); + + for ( int i=0; i<3; i++ ) { + v_re[i] = _mm_setr_pd( *(pr[T]+j+0+2*i), *(pr[T]+j+6+2*i) ); + v_im[i] = _mm_setr_pd( *(pr[T]+j+1+2*i), *(pr[T]+j+7+2*i) ); + } + for ( int i=3; i<6; i++ ) { + v_re[i] = _mm_setr_pd( *(pr[T]+j+6+2*i), *(pr[T]+j+12+2*i) ); + v_im[i] = _mm_setr_pd( *(pr[T]+j+7+2*i), *(pr[T]+j+13+2*i) ); + } + for ( int i=0; i<3; i++ ) { + __m128d buf_re = _mm_set1_pd( D_pt[0+6*i] ); + __m128d buf_im = _mm_set1_pd( D_pt[1+6*i] ); + cmul_pd( buf_re, buf_im, v_re[0], v_im[0], &res_re[i], &res_im[i] ); + cmul_pd( buf_re, buf_im, v_re[3], v_im[3], &res_re[i+3], &res_im[i+3] ); + buf_re = _mm_set1_pd( D_pt[2+6*i] ); + buf_im = _mm_set1_pd( D_pt[3+6*i] ); + cfmadd_pd( buf_re, buf_im, v_re[1], v_im[1], &res_re[i], &res_im[i] ); + cfmadd_pd( buf_re, buf_im, v_re[4], v_im[4], &res_re[i+3], &res_im[i+3] ); + buf_re = _mm_set1_pd( D_pt[4+6*i] ); + buf_im = _mm_set1_pd( D_pt[5+6*i] ); + cfmadd_pd( buf_re, buf_im, v_re[2], v_im[2], &res_re[i], &res_im[i] ); + cfmadd_pd( buf_re, buf_im, v_re[5], v_im[5], &res_re[i+3], &res_im[i+3] ); + } + D_pt += 18; + nb_pt++; + } + { + __m128d in[12]; + in[0] = _mm_unpacklo_pd( res_re[0], res_im[0] ); + in[1] = _mm_unpacklo_pd( res_re[1], res_im[1] ); + in[2] = _mm_unpacklo_pd( res_re[2], res_im[2] ); + + in[3] = _mm_unpackhi_pd( 
res_re[0], res_im[0] ); + in[4] = _mm_unpackhi_pd( res_re[1], res_im[1] ); + in[5] = _mm_unpackhi_pd( res_re[2], res_im[2] ); + + in[6] = _mm_unpacklo_pd( res_re[3], res_im[3] ); + in[7] = _mm_unpacklo_pd( res_re[4], res_im[4] ); + in[8] = _mm_unpacklo_pd( res_re[5], res_im[5] ); + + in[9] = _mm_unpackhi_pd( res_re[3], res_im[3] ); + in[10] = _mm_unpackhi_pd( res_re[4], res_im[4] ); + in[11] = _mm_unpackhi_pd( res_re[5], res_im[5] ); + + for ( int i=0; i<12; i++ ) { + res[i] = _mm_sub_pd( res[i], in[i] ); + } + for ( int i=0; i<3; i++ ) { + __m128d buf1 = _mm_mul_pd( gamma0[T], GAMMA_T_SHUFFLE(in[i]) ); + res[3*gamma_co[T][0]+6*(gamma_co[T][0]/2)+i] = _mm_sub_pd( res[3*gamma_co[T][0]+6*(gamma_co[T][0]/2)+i], buf1 ); + } + for ( int i=0; i<3; i++ ) { + __m128d buf1 = _mm_mul_pd( gamma1[T], GAMMA_T_SHUFFLE(in[i+3]) ); + res[3*gamma_co[T][1]+6*(gamma_co[T][1]/2)+i] = _mm_sub_pd( res[3*gamma_co[T][1]+6*(gamma_co[T][1]/2)+i], buf1 ); + } + for ( int i=0; i<3; i++ ) { + __m128d buf1 = _mm_mul_pd( gamma0[T], GAMMA_T_SHUFFLE(in[i+6]) ); + res[3*gamma_co[T][0]+6*(gamma_co[T][0]/2)+6+i] = _mm_sub_pd( res[3*gamma_co[T][0]+6*(gamma_co[T][0]/2)+6+i], buf1 ); + } + for ( int i=0; i<3; i++ ) { + __m128d buf1 = _mm_mul_pd( gamma1[T], GAMMA_T_SHUFFLE(in[i+9]) ); + res[3*gamma_co[T][1]+6*(gamma_co[T][1]/2)+6+i] = _mm_sub_pd( res[3*gamma_co[T][1]+6*(gamma_co[T][1]/2)+6+i], buf1 ); + } + } + } + // --------------- + // mu = Z + { + __m128d res_re[6]; + __m128d res_im[6]; + { + __m128d v_re[6]; + __m128d v_im[6]; + int j = 2*12*(*nb_pt); + + for ( int i=0; i<3; i++ ) { + v_re[i] = _mm_setr_pd( *(pr[Z]+j+0+2*i), *(pr[Z]+j+6+2*i) ); + v_im[i] = _mm_setr_pd( *(pr[Z]+j+1+2*i), *(pr[Z]+j+7+2*i) ); + } + for ( int i=3; i<6; i++ ) { + v_re[i] = _mm_setr_pd( *(pr[Z]+j+6+2*i), *(pr[Z]+j+12+2*i) ); + v_im[i] = _mm_setr_pd( *(pr[Z]+j+7+2*i), *(pr[Z]+j+13+2*i) ); + } + for ( int i=0; i<3; i++ ) { + __m128d buf_re = _mm_set1_pd( D_pt[0+6*i] ); + __m128d buf_im = _mm_set1_pd( D_pt[1+6*i] ); + cmul_pd( buf_re, buf_im, v_re[0], v_im[0], &res_re[i], &res_im[i] ); + cmul_pd( buf_re, buf_im, v_re[3], v_im[3], &res_re[i+3], &res_im[i+3] ); + buf_re = _mm_set1_pd( D_pt[2+6*i] ); + buf_im = _mm_set1_pd( D_pt[3+6*i] ); + cfmadd_pd( buf_re, buf_im, v_re[1], v_im[1], &res_re[i], &res_im[i] ); + cfmadd_pd( buf_re, buf_im, v_re[4], v_im[4], &res_re[i+3], &res_im[i+3] ); + buf_re = _mm_set1_pd( D_pt[4+6*i] ); + buf_im = _mm_set1_pd( D_pt[5+6*i] ); + cfmadd_pd( buf_re, buf_im, v_re[2], v_im[2], &res_re[i], &res_im[i] ); + cfmadd_pd( buf_re, buf_im, v_re[5], v_im[5], &res_re[i+3], &res_im[i+3] ); + } + D_pt += 18; + nb_pt++; + } + { + __m128d in[12]; + in[0] = _mm_unpacklo_pd( res_re[0], res_im[0] ); + in[1] = _mm_unpacklo_pd( res_re[1], res_im[1] ); + in[2] = _mm_unpacklo_pd( res_re[2], res_im[2] ); + + in[3] = _mm_unpackhi_pd( res_re[0], res_im[0] ); + in[4] = _mm_unpackhi_pd( res_re[1], res_im[1] ); + in[5] = _mm_unpackhi_pd( res_re[2], res_im[2] ); + + in[6] = _mm_unpacklo_pd( res_re[3], res_im[3] ); + in[7] = _mm_unpacklo_pd( res_re[4], res_im[4] ); + in[8] = _mm_unpacklo_pd( res_re[5], res_im[5] ); + + in[9] = _mm_unpackhi_pd( res_re[3], res_im[3] ); + in[10] = _mm_unpackhi_pd( res_re[4], res_im[4] ); + in[11] = _mm_unpackhi_pd( res_re[5], res_im[5] ); + + for ( int i=0; i<12; i++ ) { + res[i] = _mm_sub_pd( res[i], in[i] ); + } + for ( int i=0; i<3; i++ ) { + __m128d buf1 = _mm_mul_pd( gamma0[Z], GAMMA_Z_SHUFFLE(in[i]) ); + res[3*gamma_co[Z][0]+6*(gamma_co[Z][0]/2)+i] = _mm_sub_pd( res[3*gamma_co[Z][0]+6*(gamma_co[Z][0]/2)+i], buf1 
); + } + for ( int i=0; i<3; i++ ) { + __m128d buf1 = _mm_mul_pd( gamma1[Z], GAMMA_Z_SHUFFLE(in[i+3]) ); + res[3*gamma_co[Z][1]+6*(gamma_co[Z][1]/2)+i] = _mm_sub_pd( res[3*gamma_co[Z][1]+6*(gamma_co[Z][1]/2)+i], buf1 ); + } + for ( int i=0; i<3; i++ ) { + __m128d buf1 = _mm_mul_pd( gamma0[Z], GAMMA_Z_SHUFFLE(in[i+6]) ); + res[3*gamma_co[Z][0]+6*(gamma_co[Z][0]/2)+6+i] = _mm_sub_pd( res[3*gamma_co[Z][0]+6*(gamma_co[Z][0]/2)+6+i], buf1 ); + } + for ( int i=0; i<3; i++ ) { + __m128d buf1 = _mm_mul_pd( gamma1[Z], GAMMA_Z_SHUFFLE(in[i+9]) ); + res[3*gamma_co[Z][1]+6*(gamma_co[Z][1]/2)+6+i] = _mm_sub_pd( res[3*gamma_co[Z][1]+6*(gamma_co[Z][1]/2)+6+i], buf1 ); } } } // --------------- // mu = Y { - __m128d res_re[3]; - __m128d res_im[3]; + __m128d res_re[6]; + __m128d res_im[6]; { - __m128d v_re[3]; - __m128d v_im[3]; - int j = 2*6*(*nb_pt); + __m128d v_re[6]; + __m128d v_im[6]; + int j = 2*12*(*nb_pt); for ( int i=0; i<3; i++ ) { v_re[i] = _mm_setr_pd( *(pr[Y]+j+0+2*i), *(pr[Y]+j+6+2*i) ); v_im[i] = _mm_setr_pd( *(pr[Y]+j+1+2*i), *(pr[Y]+j+7+2*i) ); } + for ( int i=3; i<6; i++ ) { + v_re[i] = _mm_setr_pd( *(pr[Y]+j+6+2*i), *(pr[Y]+j+12+2*i) ); + v_im[i] = _mm_setr_pd( *(pr[Y]+j+7+2*i), *(pr[Y]+j+13+2*i) ); + } for ( int i=0; i<3; i++ ) { __m128d buf_re = _mm_set1_pd( D_pt[0+6*i] ); __m128d buf_im = _mm_set1_pd( D_pt[1+6*i] ); cmul_pd( buf_re, buf_im, v_re[0], v_im[0], &res_re[i], &res_im[i] ); + cmul_pd( buf_re, buf_im, v_re[3], v_im[3], &res_re[i+3], &res_im[i+3] ); buf_re = _mm_set1_pd( D_pt[2+6*i] ); buf_im = _mm_set1_pd( D_pt[3+6*i] ); cfmadd_pd( buf_re, buf_im, v_re[1], v_im[1], &res_re[i], &res_im[i] ); + cfmadd_pd( buf_re, buf_im, v_re[4], v_im[4], &res_re[i+3], &res_im[i+3] ); buf_re = _mm_set1_pd( D_pt[4+6*i] ); buf_im = _mm_set1_pd( D_pt[5+6*i] ); cfmadd_pd( buf_re, buf_im, v_re[2], v_im[2], &res_re[i], &res_im[i] ); + cfmadd_pd( buf_re, buf_im, v_re[5], v_im[5], &res_re[i+3], &res_im[i+3] ); } D_pt += 18; nb_pt++; } { - __m128d in[6]; + __m128d in[12]; in[0] = _mm_unpacklo_pd( res_re[0], res_im[0] ); in[1] = _mm_unpacklo_pd( res_re[1], res_im[1] ); in[2] = _mm_unpacklo_pd( res_re[2], res_im[2] ); @@ -649,49 +1749,72 @@ void su3_pbp_double( complex_double* eta, complex_double *prn[4], operator_doubl in[4] = _mm_unpackhi_pd( res_re[1], res_im[1] ); in[5] = _mm_unpackhi_pd( res_re[2], res_im[2] ); - for ( int i=0; i<6; i++ ) { + in[6] = _mm_unpacklo_pd( res_re[3], res_im[3] ); + in[7] = _mm_unpacklo_pd( res_re[4], res_im[4] ); + in[8] = _mm_unpacklo_pd( res_re[5], res_im[5] ); + + in[9] = _mm_unpackhi_pd( res_re[3], res_im[3] ); + in[10] = _mm_unpackhi_pd( res_re[4], res_im[4] ); + in[11] = _mm_unpackhi_pd( res_re[5], res_im[5] ); + + for ( int i=0; i<12; i++ ) { res[i] = _mm_sub_pd( res[i], in[i] ); } for ( int i=0; i<3; i++ ) { __m128d buf1 = _mm_mul_pd( gamma0[Y], GAMMA_Y_SHUFFLE(in[i]) ); - res[3*gamma_co[Y][0]+i] = _mm_sub_pd( res[3*gamma_co[Y][0]+i], buf1 ); + res[3*gamma_co[Y][0]+6*(gamma_co[Y][0]/2)+i] = _mm_sub_pd( res[3*gamma_co[Y][0]+6*(gamma_co[Y][0]/2)+i], buf1 ); } for ( int i=0; i<3; i++ ) { - __m128d buf1 = _mm_mul_pd( gamma1[Y], GAMMA_Y_SHUFFLE(in[3+i]) ); - res[3*gamma_co[Y][1]+i] = _mm_sub_pd( res[3*gamma_co[Y][1]+i], buf1 ); + __m128d buf1 = _mm_mul_pd( gamma1[Y], GAMMA_Y_SHUFFLE(in[i+3]) ); + res[3*gamma_co[Y][1]+6*(gamma_co[Y][1]/2)+i] = _mm_sub_pd( res[3*gamma_co[Y][1]+6*(gamma_co[Y][1]/2)+i], buf1 ); + } + for ( int i=0; i<3; i++ ) { + __m128d buf1 = _mm_mul_pd( gamma0[Y], GAMMA_Y_SHUFFLE(in[i+6]) ); + res[3*gamma_co[Y][0]+6*(gamma_co[Y][0]/2)+6+i] = _mm_sub_pd( 
res[3*gamma_co[Y][0]+6*(gamma_co[Y][0]/2)+6+i], buf1 ); + } + for ( int i=0; i<3; i++ ) { + __m128d buf1 = _mm_mul_pd( gamma1[Y], GAMMA_Y_SHUFFLE(in[i+9]) ); + res[3*gamma_co[Y][1]+6*(gamma_co[Y][1]/2)+6+i] = _mm_sub_pd( res[3*gamma_co[Y][1]+6*(gamma_co[Y][1]/2)+6+i], buf1 ); } } } - // --------------- + // --------------- // mu = X { - __m128d res_re[3]; - __m128d res_im[3]; + __m128d res_re[6]; + __m128d res_im[6]; { - __m128d v_re[3]; - __m128d v_im[3]; - int j = 2*6*(*nb_pt); + __m128d v_re[6]; + __m128d v_im[6]; + int j = 2*12*(*nb_pt); for ( int i=0; i<3; i++ ) { v_re[i] = _mm_setr_pd( *(pr[X]+j+0+2*i), *(pr[X]+j+6+2*i) ); v_im[i] = _mm_setr_pd( *(pr[X]+j+1+2*i), *(pr[X]+j+7+2*i) ); } + for ( int i=3; i<6; i++ ) { + v_re[i] = _mm_setr_pd( *(pr[X]+j+6+2*i), *(pr[X]+j+12+2*i) ); + v_im[i] = _mm_setr_pd( *(pr[X]+j+7+2*i), *(pr[X]+j+13+2*i) ); + } for ( int i=0; i<3; i++ ) { __m128d buf_re = _mm_set1_pd( D_pt[0+6*i] ); __m128d buf_im = _mm_set1_pd( D_pt[1+6*i] ); cmul_pd( buf_re, buf_im, v_re[0], v_im[0], &res_re[i], &res_im[i] ); + cmul_pd( buf_re, buf_im, v_re[3], v_im[3], &res_re[i+3], &res_im[i+3] ); buf_re = _mm_set1_pd( D_pt[2+6*i] ); buf_im = _mm_set1_pd( D_pt[3+6*i] ); cfmadd_pd( buf_re, buf_im, v_re[1], v_im[1], &res_re[i], &res_im[i] ); + cfmadd_pd( buf_re, buf_im, v_re[4], v_im[4], &res_re[i+3], &res_im[i+3] ); buf_re = _mm_set1_pd( D_pt[4+6*i] ); buf_im = _mm_set1_pd( D_pt[5+6*i] ); cfmadd_pd( buf_re, buf_im, v_re[2], v_im[2], &res_re[i], &res_im[i] ); + cfmadd_pd( buf_re, buf_im, v_re[5], v_im[5], &res_re[i+3], &res_im[i+3] ); } D_pt += 18; nb_pt++; } { - __m128d in[6]; + __m128d in[12]; in[0] = _mm_unpacklo_pd( res_re[0], res_im[0] ); in[1] = _mm_unpacklo_pd( res_re[1], res_im[1] ); in[2] = _mm_unpacklo_pd( res_re[2], res_im[2] ); @@ -700,38 +1823,54 @@ void su3_pbp_double( complex_double* eta, complex_double *prn[4], operator_doubl in[4] = _mm_unpackhi_pd( res_re[1], res_im[1] ); in[5] = _mm_unpackhi_pd( res_re[2], res_im[2] ); - for ( int i=0; i<6; i++ ) { + in[6] = _mm_unpacklo_pd( res_re[3], res_im[3] ); + in[7] = _mm_unpacklo_pd( res_re[4], res_im[4] ); + in[8] = _mm_unpacklo_pd( res_re[5], res_im[5] ); + + in[9] = _mm_unpackhi_pd( res_re[3], res_im[3] ); + in[10] = _mm_unpackhi_pd( res_re[4], res_im[4] ); + in[11] = _mm_unpackhi_pd( res_re[5], res_im[5] ); + + for ( int i=0; i<12; i++ ) { res[i] = _mm_sub_pd( res[i], in[i] ); } for ( int i=0; i<3; i++ ) { __m128d buf1 = _mm_mul_pd( gamma0[X], GAMMA_X_SHUFFLE(in[i]) ); - res[3*gamma_co[X][0]+i] = _mm_sub_pd( res[3*gamma_co[X][0]+i], buf1 ); + res[3*gamma_co[X][0]+6*(gamma_co[X][0]/2)+i] = _mm_sub_pd( res[3*gamma_co[X][0]+6*(gamma_co[X][0]/2)+i], buf1 ); } for ( int i=0; i<3; i++ ) { - __m128d buf1 = _mm_mul_pd( gamma1[X], GAMMA_X_SHUFFLE(in[3+i]) ); - res[3*gamma_co[X][1]+i] = _mm_sub_pd( res[3*gamma_co[X][1]+i], buf1 ); + __m128d buf1 = _mm_mul_pd( gamma1[X], GAMMA_X_SHUFFLE(in[i+3]) ); + res[3*gamma_co[X][1]+6*(gamma_co[X][1]/2)+i] = _mm_sub_pd( res[3*gamma_co[X][1]+6*(gamma_co[X][1]/2)+i], buf1 ); + } + for ( int i=0; i<3; i++ ) { + __m128d buf1 = _mm_mul_pd( gamma0[X], GAMMA_X_SHUFFLE(in[i+6]) ); + res[3*gamma_co[X][0]+6*(gamma_co[X][0]/2)+6+i] = _mm_sub_pd( res[3*gamma_co[X][0]+6*(gamma_co[X][0]/2)+6+i], buf1 ); + } + for ( int i=0; i<3; i++ ) { + __m128d buf1 = _mm_mul_pd( gamma1[X], GAMMA_X_SHUFFLE(in[i+9]) ); + res[3*gamma_co[X][1]+6*(gamma_co[X][1]/2)+6+i] = _mm_sub_pd( res[3*gamma_co[X][1]+6*(gamma_co[X][1]/2)+6+i], buf1 ); } } } // --------------- - for ( int i=0; i<12; i++ ) { + for ( int i=0; i<24; i++ ) { 
_mm_storeu_pd( eta_pt + 2*i, res[i] ); } - eta_pt+=24; + eta_pt+=48; } } -void su3_pbp_float( complex_float* eta, complex_float *prn[4], operator_float_struct *op, +void su3_dpbp_float( complex_float* eta, complex_float *prn[4], operator_float_struct *op, int *neighbor, int start, int end ) { - float *D_pt = (float*)(op->D_vectorized+2*(start*4)); + float *D_pt = (float*)(op->D_vectorized+2*(start/24*48)); float *eta_pt = (float*)(eta+start); float *eta_end_pt = (float*)(eta+end); float *pr[4] = {(float*)(prn[0]),(float*)(prn[1]),(float*)(prn[2]),(float*)(prn[3])}; - int *nb_pt = neighbor+((start/12)*4); + int *nb_pt = neighbor+((start/24)*4); __m128 gamma0[4][2]; __m128 gamma1[4][2]; @@ -749,21 +1888,29 @@ void su3_pbp_float( complex_float* eta, complex_float *prn[4], operator_float_st __m128 eta_lo2 = _mm_loadu_ps( eta_pt + 4 ); __m128 eta_hi1 = _mm_loadu_ps( eta_pt + 6 ); __m128 eta_hi2 = _mm_loadu_ps( eta_pt + 10 ); + __m128 eta_lo3 = _mm_loadu_ps( eta_pt + 12 ); + __m128 eta_lo4 = _mm_loadu_ps( eta_pt + 16 ); + __m128 eta_hi3 = _mm_loadu_ps( eta_pt + 18 ); + __m128 eta_hi4 = _mm_loadu_ps( eta_pt + 22 ); - __m128 eta2_lo[2]; - __m128 eta2_hi[2]; + __m128 eta2_lo[4]; + __m128 eta2_hi[4]; - eta2_lo[0] = _mm_loadu_ps( eta_pt + 12 ); - eta2_hi[0] = _mm_loadu_ps( eta_pt + 14 ); - eta2_lo[1] = _mm_loadu_ps( eta_pt + 18 ); - eta2_hi[1] = _mm_loadu_ps( eta_pt + 20 ); + eta2_lo[0] = _mm_loadu_ps( eta_pt + 24 ); + eta2_hi[0] = _mm_loadu_ps( eta_pt + 26 ); + eta2_lo[1] = _mm_loadu_ps( eta_pt + 30 ); + eta2_hi[1] = _mm_loadu_ps( eta_pt + 32 ); + eta2_lo[2] = _mm_loadu_ps( eta_pt + 36 ); + eta2_hi[2] = _mm_loadu_ps( eta_pt + 38 ); + eta2_lo[3] = _mm_loadu_ps( eta_pt + 42 ); + eta2_hi[3] = _mm_loadu_ps( eta_pt + 44 ); for ( int mu=0; mu<4; mu++ ) { - __m128 res1[2]; - __m128 res2[2]; + __m128 res1[4]; + __m128 res2[4]; { - int j = 2*6*(*nb_pt); + int j = 2*12*(*nb_pt); // load 1st part of su(3) matrix and multiply { __m128 buf1 = _mm_loadu_ps( D_pt ); @@ -778,6 +1925,16 @@ void su3_pbp_float( complex_float* eta, complex_float *prn[4], operator_float_st __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+7) ); cmul( buf1, buf2, buf3, buf4, &res2[0], &res2[1] ); } + { + __m128 buf3 = _mm_set1_ps( *(pr[mu]+j+12) ); + __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+13) ); + cmul( buf1, buf2, buf3, buf4, &res1[2], &res1[3] ); + } + { + __m128 buf3 = _mm_set1_ps( *(pr[mu]+j+18) ); + __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+19) ); + cmul( buf1, buf2, buf3, buf4, &res2[2], &res2[3] ); + } } // load 2nd part of su(3) matrix and multiply { @@ -793,6 +1950,16 @@ void su3_pbp_float( complex_float* eta, complex_float *prn[4], operator_float_st __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+9) ); cfmadd( buf1, buf2, buf3, buf4, &res2[0], &res2[1] ); } + { + __m128 buf3 = _mm_set1_ps( *(pr[mu]+j+14) ); + __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+15) ); + cfmadd( buf1, buf2, buf3, buf4, &res1[2], &res1[3] ); + } + { + __m128 buf3 = _mm_set1_ps( *(pr[mu]+j+20) ); + __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+21) ); + cfmadd( buf1, buf2, buf3, buf4, &res2[2], &res2[3] ); + } } // load 3rd part of su(3) matrix and multiply { @@ -808,6 +1975,16 @@ void su3_pbp_float( complex_float* eta, complex_float *prn[4], operator_float_st __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+11) ); cfmadd( buf1, buf2, buf3, buf4, &res2[0], &res2[1] ); } + { + __m128 buf3 = _mm_set1_ps( *(pr[mu]+j+16) ); + __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+17) ); + cfmadd( buf1, buf2, buf3, buf4, &res1[2], &res1[3] ); + } + { + __m128 buf3 = _mm_set1_ps( *(pr[mu]+j+22) ); + __m128 buf4 = _mm_set1_ps( 
*(pr[mu]+j+23) ); + cfmadd( buf1, buf2, buf3, buf4, &res2[2], &res2[3] ); + } } } @@ -819,17 +1996,33 @@ void su3_pbp_float( complex_float* eta, complex_float *prn[4], operator_float_st eta_lo1 = _mm_sub_ps( eta_lo1, buf1 ); eta_lo2 = _mm_sub_ps( eta_lo2, buf2 ); } + { + __m128 buf1 = _mm_unpacklo_ps( res1[2], res1[3] ); + __m128 buf2 = _mm_unpackhi_ps( res1[2], res1[3] ); + eta_lo3 = _mm_sub_ps( eta_lo3, buf1 ); + eta_lo4 = _mm_sub_ps( eta_lo4, buf2 ); + } // store contribution from 1st SU(3) multiplication to either spin2 or spin3 - __m128 buf1 = _mm_mul_ps( gamma0[mu][0], res1[gamma_offset[mu][0]] ); - __m128 buf2 = _mm_mul_ps( gamma0[mu][1], res1[1-gamma_offset[mu][0]] ); - __m128 buf3 = _mm_unpacklo_ps( buf1, buf2 ); - __m128 buf4 = _mm_unpackhi_ps( buf1, buf2 ); - buf4 = _mm_shuffle_ps( buf3, buf4, _MM_SHUFFLE(1,0,3,2) ); // hi(buf3), lo(buf4) - eta2_lo[gamma_co[mu][2]] = _mm_sub_ps( eta2_lo[gamma_co[mu][2]], buf3 ); - eta2_hi[gamma_co[mu][2]] = _mm_sub_ps( eta2_hi[gamma_co[mu][2]], buf4 ); + { + __m128 buf1 = _mm_mul_ps( gamma0[mu][0], res1[gamma_offset[mu][0]] ); + __m128 buf2 = _mm_mul_ps( gamma0[mu][1], res1[1-gamma_offset[mu][0]] ); + __m128 buf3 = _mm_unpacklo_ps( buf1, buf2 ); + __m128 buf4 = _mm_unpackhi_ps( buf1, buf2 ); + buf4 = _mm_shuffle_ps( buf3, buf4, _MM_SHUFFLE(1,0,3,2) ); // hi(buf3), lo(buf4) + eta2_lo[gamma_co[mu][2]] = _mm_sub_ps( eta2_lo[gamma_co[mu][2]], buf3 ); + eta2_hi[gamma_co[mu][2]] = _mm_sub_ps( eta2_hi[gamma_co[mu][2]], buf4 ); + } + { + __m128 buf1 = _mm_mul_ps( gamma0[mu][0], res1[2+gamma_offset[mu][0]] ); + __m128 buf2 = _mm_mul_ps( gamma0[mu][1], res1[3-gamma_offset[mu][0]] ); + __m128 buf3 = _mm_unpacklo_ps( buf1, buf2 ); + __m128 buf4 = _mm_unpackhi_ps( buf1, buf2 ); + buf4 = _mm_shuffle_ps( buf3, buf4, _MM_SHUFFLE(1,0,3,2) ); // hi(buf3), lo(buf4) + eta2_lo[2+gamma_co[mu][2]] = _mm_sub_ps( eta2_lo[2+gamma_co[mu][2]], buf3 ); + eta2_hi[2+gamma_co[mu][2]] = _mm_sub_ps( eta2_hi[2+gamma_co[mu][2]], buf4 ); + } } - { // store spin1 contribution { @@ -838,17 +2031,33 @@ void su3_pbp_float( complex_float* eta, complex_float *prn[4], operator_float_st eta_hi1 = _mm_sub_ps( eta_hi1, buf1 ); eta_hi2 = _mm_sub_ps( eta_hi2, buf2 ); } + { + __m128 buf1 = _mm_unpacklo_ps( res2[2], res2[3] ); + __m128 buf2 = _mm_unpackhi_ps( res2[2], res2[3] ); + eta_hi3 = _mm_sub_ps( eta_hi3, buf1 ); + eta_hi4 = _mm_sub_ps( eta_hi4, buf2 ); + } // store contribution from 1st SU(3) multiplication to either spin2 or spin3 - __m128 buf1 = _mm_mul_ps( gamma1[mu][0], res2[gamma_offset[mu][1]] ); - __m128 buf2 = _mm_mul_ps( gamma1[mu][1], res2[1-gamma_offset[mu][1]] ); - __m128 buf3 = _mm_unpacklo_ps( buf1, buf2 ); - __m128 buf4 = _mm_unpackhi_ps( buf1, buf2 ); - buf4 = _mm_shuffle_ps( buf3, buf4, _MM_SHUFFLE(1,0,3,2) ); // hi(buf3), lo(buf4) - eta2_lo[gamma_co[mu][3]] = _mm_sub_ps( eta2_lo[gamma_co[mu][3]], buf3 ); - eta2_hi[gamma_co[mu][3]] = _mm_sub_ps( eta2_hi[gamma_co[mu][3]], buf4 ); + { + __m128 buf1 = _mm_mul_ps( gamma1[mu][0], res2[gamma_offset[mu][1]] ); + __m128 buf2 = _mm_mul_ps( gamma1[mu][1], res2[1-gamma_offset[mu][1]] ); + __m128 buf3 = _mm_unpacklo_ps( buf1, buf2 ); + __m128 buf4 = _mm_unpackhi_ps( buf1, buf2 ); + buf4 = _mm_shuffle_ps( buf3, buf4, _MM_SHUFFLE(1,0,3,2) ); // hi(buf3), lo(buf4) + eta2_lo[gamma_co[mu][3]] = _mm_sub_ps( eta2_lo[gamma_co[mu][3]], buf3 ); + eta2_hi[gamma_co[mu][3]] = _mm_sub_ps( eta2_hi[gamma_co[mu][3]], buf4 ); + } + { + __m128 buf1 = _mm_mul_ps( gamma1[mu][0], res2[2+gamma_offset[mu][1]] ); + __m128 buf2 = _mm_mul_ps( gamma1[mu][1], 
res2[3-gamma_offset[mu][1]] ); + __m128 buf3 = _mm_unpacklo_ps( buf1, buf2 ); + __m128 buf4 = _mm_unpackhi_ps( buf1, buf2 ); + buf4 = _mm_shuffle_ps( buf3, buf4, _MM_SHUFFLE(1,0,3,2) ); // hi(buf3), lo(buf4) + eta2_lo[2+gamma_co[mu][3]] = _mm_sub_ps( eta2_lo[2+gamma_co[mu][3]], buf3 ); + eta2_hi[2+gamma_co[mu][3]] = _mm_sub_ps( eta2_hi[2+gamma_co[mu][3]], buf4 ); + } } - nb_pt++; D_pt += 24; } @@ -857,18 +2066,25 @@ void su3_pbp_float( complex_float* eta, complex_float *prn[4], operator_float_st _mm_storeu_ps( eta_pt+4, eta_lo2 ); _mm_storeu_ps( eta_pt+6, eta_hi1 ); _mm_storeu_ps( eta_pt+10, eta_hi2 ); - _mm_storeu_ps( eta_pt+12, eta2_lo[0] ); - _mm_storeu_ps( eta_pt+14, eta2_hi[0] ); - _mm_storeu_ps( eta_pt+18, eta2_lo[1] ); - _mm_storeu_ps( eta_pt+20, eta2_hi[1] ); + _mm_storeu_ps( eta_pt+12, eta_lo3 ); + _mm_storeu_ps( eta_pt+16, eta_lo4 ); + _mm_storeu_ps( eta_pt+18, eta_hi3 ); + _mm_storeu_ps( eta_pt+22, eta_hi4 ); + _mm_storeu_ps( eta_pt+24, eta2_lo[0] ); + _mm_storeu_ps( eta_pt+26, eta2_hi[0] ); + _mm_storeu_ps( eta_pt+30, eta2_lo[1] ); + _mm_storeu_ps( eta_pt+32, eta2_hi[1] ); + _mm_storeu_ps( eta_pt+36, eta2_lo[2] ); + _mm_storeu_ps( eta_pt+38, eta2_hi[2] ); + _mm_storeu_ps( eta_pt+42, eta2_lo[3] ); + _mm_storeu_ps( eta_pt+44, eta2_hi[3] ); - eta_pt += 24; + eta_pt += 48; } } - void block_oddeven_plus_coupling_double( double *eta, double *D, double *phi, int mu, int start, int end, int *ind, int *neighbor ) { } void block_oddeven_pT_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { #define UPD _mm_sub_ps @@ -1266,7 +2482,7 @@ static inline int sse_clover_imag_index( int i, int j ) { void sse_set_clover_double( double *out, complex_double *in ) { } -void sse_set_clover_float( float *out, complex_double *in ) { +void sse_set_clover_float( float *out, complex_float *in ) { int index; float sign = 0.0; @@ -1274,7 +2490,7 @@ void sse_set_clover_float( float *out, complex_double *in ) { for ( int j=0; j<6; j++ ) { for ( int i=0; iclover+start:op->clover+(start/12)*42; - - clover_double( leta, lphi, clover, end-start, l, threading ); -#ifdef HAVE_TM - add_diagonal_double( leta, lphi, op->tm_term+start, end-start ); -#endif +void sse_set_clover_doublet_double( double *out, complex_double *in ) { } +void sse_set_clover_doublet_float( float *out, complex_float *in ) { + + int index, d; + float sign = 0.0; + for ( int k=0; k<12; k+=SIMD_LENGTH_float ) { + for ( int j=0; j<6; j++ ) { + for ( int i=0; i i+k ) { + // upper triangle + index = 12 + ( 30 - (5-(k+i))*(6-(k+i)) )/2 + (j-(i+k+1)); + sign = 1.0; + } else { + // lower triangle, j < i+k + index = 12 + ( 30 - (5-(j))*(6-(j)) )/2 + ((i+k)-(j+1)); + sign = -1.0; + } + } else { + // i+k >= 6 + // second 6-by-6 matrix + if ( j > i+k-6 ) { + // upper triangle + index = 12 + 15 + ( 30 - (5-(k+i-6))*(6-(k+i-6)) )/2 + (j-(i+k-6+1)); + sign = 1.0; + } else { + // j < i+k-6 + // lower triangle + index = 12 + 15 + ( 30 - (5-(j))*(6-(j)) )/2 + ((i+k-6)-(j+1)); + sign = -1.0; + } + } + d=(i+k<6)?0:6; + out[ sse_clover_real_index(i+k+d,j) ] = creal_float( in[index] ); + out[ sse_clover_imag_index(i+k+d,j) ] = sign*cimag_float( in[index] ); + out[ sse_clover_real_index(i+k+d+6,j) ] = creal_float( in[index] ); + out[ sse_clover_imag_index(i+k+d+6,j) ] = sign*cimag_float( in[index] ); + } + } + } } +void sse_add_diagonal_clover_double( double *out, complex_double *diag ) { } -void sse_clover_float( vector_float eta, vector_float phi, operator_float_struct *op, int start, int end, - level_struct *l, struct 
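
[Editor's sketch restating the indexing above, not new patch code: sse_set_clover_doublet_float addresses the packed clover term through index = 12 + (30-(5-(k+i))*(6-(k+i)))/2 + (j-(i+k+1)), i.e. 12 diagonal entries followed by the strictly-upper triangles of the two 6x6 chirality blocks, row by row; each value is written twice because the doublet's two flavours share the same clover.]

/* packed index of the off-diagonal element (row, col), col > row, of chirality
 * block 0 or 1; rows 0..row-1 occupy 5+4+... = (30-(5-row)*(6-row))/2 slots */
static int clover_offdiag_index( int block, int row, int col ) {
  return 12 + 15*block + ( 30 - (5-row)*(6-row) )/2 + ( col - (row+1) );
}
/* examples: block 0, (0,1) -> 12;  block 0, (1,2) -> 17;  block 1, (0,1) -> 27 */
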
Thread *threading ) { +void sse_add_diagonal_clover_float( float *out, complex_float *diag ) { + for ( int k=0; k<12; k++ ) { + out[ sse_clover_real_index(k,k%6) ] += creal_float( diag[k] ); + out[ sse_clover_imag_index(k,k%6) ] += cimag_float( diag[k] ); + } +} - if ( g.csw == 0.0 ) { - vector_float lphi = phi+start, leta = eta+start; - config_float clover = (g.csw==0.0)?op->clover+start:op->clover+(start/12)*42; +void sse_add_diagonal_clover_doublet_double( double *out, complex_double *diag ) { } - clover_float( leta, lphi, clover, end-start, l, threading ); -#ifdef HAVE_TM - add_diagonal_float( leta, lphi, op->tm_term+start, end-start ); -#endif - } else { - float *clov = op->clover_vectorized; - for ( int i=start; inum_block_sites, *length = s->dir_length, **index = s->index, *neighbor = s->op.neighbor_table; - vector_PRECISION lphi = phi+start, leta = eta+start; -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - PRECISION *Dplus = s->op.D_vectorized + (start/12)*96; - PRECISION *Dminus = s->op.D_transformed_vectorized + (start/12)*96; -#else - int j, k, *ind; - complex_PRECISION buf1[25]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, *buf2=buf1+6, *buf3=buf2+6, *buf4=buf3+6; - config_PRECISION D_pt; - config_PRECISION D = s->op.D + (start/12)*36; -#endif - - // clover term -#ifdef OPTIMIZED_SELF_COUPLING_PRECISION - sse_clover_PRECISION(eta, phi, &(s->op), start, start+12*n, l, no_threading ); -#else - config_PRECISION clover = (g.csw==0.0)?s->op.clover+start:s->op.clover+(start/12)*42; - clover_PRECISION( leta, lphi, clover, 12*n, l, no_threading ); -#ifdef HAVE_TM - add_diagonal_PRECISION( leta, lphi, s->op.tm_term+start, 12*n ); -#endif -#endif - -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - for ( int mu=0; mu<4; mu++ ) { - block_oddeven_plus_coupling_PRECISION( (PRECISION*)leta, Dplus, (PRECISION*)lphi, mu, 0, length[mu], index[mu], neighbor ); - block_oddeven_minus_coupling_PRECISION( (PRECISION*)leta, Dminus, (PRECISION*)lphi, mu, 0, length[mu], index[mu], neighbor ); - } -#else - // inner block couplings - ind = index[T]; // T direction - for ( i=0; inum_inner_lattice_sites, *neighbor = op->neighbor_table, start, end; -#ifndef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - int i, j, *nb_pt; - complex_PRECISION pbuf[6]; - vector_PRECISION phi_pt, eta_pt, end_pt; - config_PRECISION D_pt; -#endif - - compute_core_start_end(0, 12*n, &start, &end, l, threading ); - - SYNC_MASTER_TO_ALL(threading) - -#ifdef OPTIMIZED_SELF_COUPLING_PRECISION - sse_clover_PRECISION(eta, phi, op, start, end, l, threading ); -#else - vector_PRECISION lphi = phi+start, leta = eta+start; - config_PRECISION clover = (g.csw==0.0)?op->clover+start:op->clover+(start/12)*42; - clover_PRECISION( leta, lphi, clover, end-start, l, threading ); -#ifdef HAVE_TM - add_diagonal_PRECISION( leta, lphi, op->tm_term+start, end-start ); -#endif -#endif - - START_MASTER(threading) - PROF_PRECISION_START( _NC ); - END_MASTER(threading) - -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - complex_PRECISION *prn[4] = { op->prnT, op->prnZ, op->prnY, op->prnX }; - prp_PRECISION( prn, phi, start, end ); -#else - for ( i=start/2, phi_pt=phi+start; iprnT+i, phi_pt ); - prp_Z_PRECISION( op->prnZ+i, phi_pt ); - prp_Y_PRECISION( op->prnY+i, phi_pt ); - prp_X_PRECISION( op->prnX+i, phi_pt ); - } -#endif - // start communication in negative direction - START_LOCKED_MASTER(threading) - ghost_sendrecv_PRECISION( op->prnT, T, -1, &(op->c), _FULL_SYSTEM, l ); - ghost_sendrecv_PRECISION( op->prnZ, Z, -1, &(op->c), _FULL_SYSTEM, l ); - 
ghost_sendrecv_PRECISION( op->prnY, Y, -1, &(op->c), _FULL_SYSTEM, l ); - ghost_sendrecv_PRECISION( op->prnX, X, -1, &(op->c), _FULL_SYSTEM, l ); - END_LOCKED_MASTER(threading) - - // project plus dir and multiply with U dagger -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - complex_PRECISION *prp[4] = { op->prpT, op->prpZ, op->prpY, op->prpX }; - prn_su3_PRECISION( prp, phi, op, neighbor, start, end ); -#else - for ( phi_pt=phi+start, end_pt=phi+end, D_pt = op->D+(start*3), nb_pt=neighbor+((start/12)*4); phi_ptprpT+j, D_pt, pbuf ); - mvmh_PRECISION( op->prpT+j+3, D_pt, pbuf+3 ); D_pt += 9; - // Z dir - j = 6*(*nb_pt); nb_pt++; - prn_Z_PRECISION( pbuf, phi_pt ); - mvmh_PRECISION( op->prpZ+j, D_pt, pbuf ); - mvmh_PRECISION( op->prpZ+j+3, D_pt, pbuf+3 ); D_pt += 9; - // Y dir - j = 6*(*nb_pt); nb_pt++; - prn_Y_PRECISION( pbuf, phi_pt ); - mvmh_PRECISION( op->prpY+j, D_pt, pbuf ); - mvmh_PRECISION( op->prpY+j+3, D_pt, pbuf+3 ); D_pt += 9; - // X dir - j = 6*(*nb_pt); nb_pt++; - prn_X_PRECISION( pbuf, phi_pt ); - mvmh_PRECISION( op->prpX+j, D_pt, pbuf ); - mvmh_PRECISION( op->prpX+j+3, D_pt, pbuf+3 ); D_pt += 9; - } -#endif - - // start communication in positive direction - START_LOCKED_MASTER(threading) - ghost_sendrecv_PRECISION( op->prpT, T, +1, &(op->c), _FULL_SYSTEM, l ); - ghost_sendrecv_PRECISION( op->prpZ, Z, +1, &(op->c), _FULL_SYSTEM, l ); - ghost_sendrecv_PRECISION( op->prpY, Y, +1, &(op->c), _FULL_SYSTEM, l ); - ghost_sendrecv_PRECISION( op->prpX, X, +1, &(op->c), _FULL_SYSTEM, l ); - // wait for communication in negative direction - ghost_wait_PRECISION( op->prnT, T, -1, &(op->c), _FULL_SYSTEM, l ); - ghost_wait_PRECISION( op->prnZ, Z, -1, &(op->c), _FULL_SYSTEM, l ); - ghost_wait_PRECISION( op->prnY, Y, -1, &(op->c), _FULL_SYSTEM, l ); - ghost_wait_PRECISION( op->prnX, X, -1, &(op->c), _FULL_SYSTEM, l ); - END_LOCKED_MASTER(threading) - - // multiply with U and lift up minus dir -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - su3_pbp_PRECISION( eta, prn, op, neighbor, start, end ); -#else - for ( eta_pt=eta+start, end_pt=eta+end, D_pt = op->D+start*3, nb_pt=neighbor+(start/12)*4; eta_ptprnT+j ); - mvm_PRECISION( pbuf+3, D_pt, op->prnT+j+3 ); - pbp_su3_T_PRECISION( pbuf, eta_pt ); D_pt += 9; - // Z dir - j = 6*(*nb_pt); nb_pt++; - mvm_PRECISION( pbuf, D_pt, op->prnZ+j ); - mvm_PRECISION( pbuf+3, D_pt, op->prnZ+j+3 ); - pbp_su3_Z_PRECISION( pbuf, eta_pt ); D_pt += 9; - // Y dir - j = 6*(*nb_pt); nb_pt++; - mvm_PRECISION( pbuf, D_pt, op->prnY+j ); - mvm_PRECISION( pbuf+3, D_pt, op->prnY+j+3 ); - pbp_su3_Y_PRECISION( pbuf, eta_pt ); D_pt += 9; - // X dir - j = 6*(*nb_pt); nb_pt++; - mvm_PRECISION( pbuf, D_pt, op->prnX+j ); - mvm_PRECISION( pbuf+3, D_pt, op->prnX+j+3 ); - pbp_su3_X_PRECISION( pbuf, eta_pt ); D_pt += 9; - } -#endif - - // wait for communication in positive direction - START_LOCKED_MASTER(threading) - ghost_wait_PRECISION( op->prpT, T, +1, &(op->c), _FULL_SYSTEM, l ); - ghost_wait_PRECISION( op->prpZ, Z, +1, &(op->c), _FULL_SYSTEM, l ); - ghost_wait_PRECISION( op->prpY, Y, +1, &(op->c), _FULL_SYSTEM, l ); - ghost_wait_PRECISION( op->prpX, X, +1, &(op->c), _FULL_SYSTEM, l ); - END_LOCKED_MASTER(threading) - - // lift up plus dir -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - pbn_PRECISION( eta, prp, start, end ); -#else - for ( i=start/2, eta_pt=eta+start; iprpT+i, eta_pt ); - pbn_su3_Z_PRECISION( op->prpZ+i, eta_pt ); - pbn_su3_Y_PRECISION( op->prpY+i, eta_pt ); - pbn_su3_X_PRECISION( op->prpX+i, eta_pt ); - } -#endif - - START_MASTER(threading) - PROF_PRECISION_STOP( 
_NC, 1 ); - END_MASTER(threading) - - SYNC_MASTER_TO_ALL(threading) -} -#endif - - -#endif - diff --git a/src/sse_dirac_su3local.h b/src/sse_dirac_su3local.h index 437eae6..8e1f8ad 100644 --- a/src/sse_dirac_su3local.h +++ b/src/sse_dirac_su3local.h @@ -19,6 +19,297 @@ * */ +#ifdef HAVE_TM1p1 + if( g.n_flavours == 2 ) { +#ifdef BOUNDARY + for ( int i=start; inum_eig_vect; - - MALLOC( l->is_PRECISION.eigenvalues, complex_PRECISION, n ); - MALLOC( l->is_PRECISION.test_vector, complex_PRECISION*, n ); - -#ifndef INTERPOLATION_SETUP_LAYOUT_OPTIMIZED_PRECISION - MALLOC( l->is_PRECISION.interpolation, complex_PRECISION*, n ); - l->is_PRECISION.interpolation[0] = NULL; - MALLOC_HUGEPAGES( l->is_PRECISION.interpolation[0], complex_PRECISION, n*l->vector_size, 128 ); - for ( k=1; kis_PRECISION.interpolation[k] = l->is_PRECISION.interpolation[0] + k*l->vector_size; -#endif - // ghost shell is communicated in coarse_operator_setup, so we need size=vector_size, not inner_vector_size - MALLOC_HUGEPAGES( l->is_PRECISION.operator, complex_PRECISION, - ((size_t)OPERATOR_COMPONENT_OFFSET_PRECISION)*((size_t)l->vector_size), 128 ); - - l->is_PRECISION.test_vector[0] = NULL; - MALLOC_HUGEPAGES( l->is_PRECISION.test_vector[0], complex_PRECISION, n*l->inner_vector_size, 128 ); - for ( k=1; kis_PRECISION.test_vector[k] = l->is_PRECISION.test_vector[0] + k*l->inner_vector_size; - } -} - - -void interpolation_PRECISION_dummy_alloc( level_struct *l ) { - - MALLOC( l->is_PRECISION.test_vector, complex_PRECISION*, l->num_eig_vect ); - MALLOC( l->is_PRECISION.interpolation, complex_PRECISION*, l->num_eig_vect ); -} - - -void interpolation_PRECISION_dummy_free( level_struct *l ) { - - FREE( l->is_PRECISION.test_vector, complex_PRECISION*, l->num_eig_vect ); - FREE( l->is_PRECISION.interpolation, complex_PRECISION*, l->num_eig_vect ); -} - - -void interpolation_PRECISION_free( level_struct *l ) { - - int n = l->num_eig_vect; - - FREE_HUGEPAGES( l->is_PRECISION.test_vector[0], complex_PRECISION, n*l->inner_vector_size ); - FREE( l->is_PRECISION.eigenvalues, complex_PRECISION, n ); - FREE( l->is_PRECISION.test_vector, complex_PRECISION*, n ); -#ifndef INTERPOLATION_SETUP_LAYOUT_OPTIMIZED_PRECISION - FREE_HUGEPAGES( l->is_PRECISION.interpolation[0], complex_PRECISION, n*l->vector_size ); - FREE( l->is_PRECISION.interpolation, complex_PRECISION*, n ); -#endif - FREE_HUGEPAGES( l->is_PRECISION.operator, complex_PRECISION, OPERATOR_COMPONENT_OFFSET_PRECISION*l->vector_size ); -} - - -void swap8_PRECISION( PRECISION* data ) { - - int i; - PRECISION tmp[8]; - - for ( i=0; i<4; i++ ) { - tmp[i] = data[2*i]; - tmp[i+4] = data[2*i+1]; - } - - for ( i=0; i<8; i++ ) { - data[i] = tmp[i]; - } -} - - -void define_interpolation_PRECISION_operator( complex_PRECISION **interpolation, level_struct *l, struct Thread *threading ) { - - int j, num_eig_vect = l->num_eig_vect; - complex_PRECISION *operator = l->is_PRECISION.operator; - - int start = threading->start_index[l->depth]; - int end = threading->end_index[l->depth]; - - SYNC_CORES(threading) - int offset = SIMD_LENGTH_PRECISION; - for ( j=0; j num_eig_vect) - j_end = num_eig_vect; - - operator = l->is_PRECISION.operator + j*l->vector_size + start*offset; - - for ( int i=start; iis_PRECISION.num_agg, num_eig_vect = l->num_eig_vect, - num_parent_eig_vect = l->num_lattice_site_var/2, aggregate_sites = l->num_inner_lattice_sites / num_aggregates; - complex_PRECISION *operator = l->is_PRECISION.operator, *phi_pt = phi, - *phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer; - - 
START_LOCKED_MASTER(threading) - vector_PRECISION_distribute( phi_c_pt, phi_c, l->next_level ); - END_LOCKED_MASTER(threading) - SYNC_HYPERTHREADS(threading) - - for ( i=threading->n_thread*threading->core + threading->thread; in_core*threading->n_thread ) { - phi_pt = phi + i*2*num_parent_eig_vect*aggregate_sites; - phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer + i*2*num_eig_vect; - float tmp_phi_c_re[2*OPERATOR_COMPONENT_OFFSET_float]; - float tmp_phi_c_im[2*OPERATOR_COMPONENT_OFFSET_float]; - __m128 zero = _mm_setzero_ps(); - for ( j=0; j<2*OPERATOR_COMPONENT_OFFSET_PRECISION; j+=SIMD_LENGTH_PRECISION ) { - _mm_store_ps(tmp_phi_c_re+j, zero); - _mm_store_ps(tmp_phi_c_im+j, zero); - } - // copy phi_c into temporary - for ( j=0; jis_PRECISION.operator + j*l->vector_size + i*2*offset*num_parent_eig_vect*aggregate_sites; - - for ( k=0; kis_PRECISION.num_agg, num_eig_vect = l->num_eig_vect, - num_parent_eig_vect = l->num_lattice_site_var/2, aggregate_sites = l->num_inner_lattice_sites / num_aggregates; - complex_PRECISION *operator = l->is_PRECISION.operator, *phi_pt = phi, - *phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer; - - START_LOCKED_MASTER(threading) - vector_PRECISION_distribute( phi_c_pt, phi_c, l->next_level ); - END_LOCKED_MASTER(threading) - SYNC_HYPERTHREADS(threading) - - for ( i=threading->n_thread*threading->core + threading->thread; in_core*threading->n_thread ) { - phi_pt = phi + i*2*num_parent_eig_vect*aggregate_sites; - phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer + i*2*num_eig_vect; - float tmp_phi_c_re[2*OPERATOR_COMPONENT_OFFSET_float]; - float tmp_phi_c_im[2*OPERATOR_COMPONENT_OFFSET_float]; - __m128 zero = _mm_setzero_ps(); - for ( j=0; j<2*OPERATOR_COMPONENT_OFFSET_PRECISION; j+=SIMD_LENGTH_PRECISION ) { - _mm_store_ps(tmp_phi_c_re+j, zero); - _mm_store_ps(tmp_phi_c_im+j, zero); - } - // copy phi_c into temporary - for ( j=0; jis_PRECISION.operator + j*l->vector_size + i*2*offset*num_parent_eig_vect*aggregate_sites; - - for ( k=0; kis_PRECISION.num_agg, num_eig_vect = l->num_eig_vect, - num_parent_eig_vect = l->num_lattice_site_var/2, aggregate_sites = l->num_inner_lattice_sites / num_aggregates; - complex_PRECISION *operator = l->is_PRECISION.operator, *phi_pt = phi, - *phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer; - - for ( i=threading->n_thread*threading->core + threading->thread; in_core*threading->n_thread ) { - - phi_pt = phi + i*2*num_parent_eig_vect*aggregate_sites; - phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer + i*2*num_eig_vect; - - int offset = SIMD_LENGTH_PRECISION; - // loop over blocks of SIMD_LENGTH_PRECISION vectors - for ( j=0; jnext_level->gs_PRECISION.transfer_buffer + i*2*num_eig_vect; - operator = l->is_PRECISION.operator + j*l->vector_size + i*2*offset*num_parent_eig_vect*aggregate_sites; - - // temporary, so we can used aligned load/store, and don't have to mess around with deinterleaving - // complex components and masking - // factor 2 is for low/high (refers to spin components being split up for preserving gamma5-symmetry of coarse operator) - float tmp_phi_c_re[2*offset]; - float tmp_phi_c_im[2*offset]; - __m128 zero = _mm_setzero_ps(); - for ( k1=0; k1<2*offset; k1+=offset ) { - _mm_store_ps(tmp_phi_c_re+k1, zero); - _mm_store_ps(tmp_phi_c_im+k1, zero); - } - - for ( k=0; k broadcast - __m128 phi_re = _mm_set1_ps(((float *)phi_pt)[0]); - __m128 phi_im = _mm_set1_ps(((float *)phi_pt)[1]); - - __m128 operator_re = _mm_load_ps((float *)operator); - __m128 operator_im = _mm_load_ps((float 
*)operator+offset); - __m128 phi_c_re = _mm_load_ps(tmp_phi_c_re+low_high_offset); - __m128 phi_c_im = _mm_load_ps(tmp_phi_c_im+low_high_offset); - - cfmadd_conj(operator_re, operator_im, phi_re, phi_im, &phi_c_re, &phi_c_im); - - _mm_store_ps(tmp_phi_c_re+low_high_offset, phi_c_re); - _mm_store_ps(tmp_phi_c_im+low_high_offset, phi_c_im); - // skip to next real line of matrix - operator += offset; - phi_pt++; - } - low_high_offset = offset; - } - } - - for ( int m=0; m= num_eig_vect ) break; - ((float*)(phi_c_pt+j+m))[0] = tmp_phi_c_re[m]; - ((float*)(phi_c_pt+j+m))[1] = tmp_phi_c_im[m]; - } - - for ( int m=0; m= num_eig_vect ) break; - ((float*)(phi_c_pt+num_eig_vect+j+m))[0] = tmp_phi_c_re[m+offset]; - ((float*)(phi_c_pt+num_eig_vect+j+m))[1] = tmp_phi_c_im[m+offset]; - } - } - } - - SYNC_HYPERTHREADS(threading) - START_LOCKED_MASTER(threading) - vector_PRECISION_gather( phi_c, l->next_level->gs_PRECISION.transfer_buffer, l->next_level ); - END_LOCKED_MASTER(threading) - PROF_PRECISION_STOP( _PR, 1, threading ); -} - -#endif // defined( SSE ) && defined( INTERPOLATION_OPERATOR_LAYOUT_OPTIMIZED_PRECISION ) diff --git a/src/sse_interpolation_generic.h b/src/sse_interpolation_generic.h deleted file mode 100644 index 2db7a86..0000000 --- a/src/sse_interpolation_generic.h +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder. - * - * This file is part of the DDalphaAMG solver library. - * - * The DDalphaAMG solver library is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * The DDalphaAMG solver library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * - * You should have received a copy of the GNU General Public License - * along with the DDalphaAMG solver library. If not, see http://www.gnu.org/licenses/. - * - */ - -#ifndef SSE_INTERPOLATION_PRECISION_HEADER - #define SSE_INTERPOLATION_PRECISION_HEADER - - #ifdef SSE - void interpolation_PRECISION_alloc( level_struct *l ); - void interpolation_PRECISION_free( level_struct *l ); - void interpolation_PRECISION_dummy_alloc( level_struct *l ); - void interpolation_PRECISION_dummy_free( level_struct *l ); - - void interpolate_PRECISION( vector_PRECISION phi, vector_PRECISION phi_c, level_struct *l, Thread *threading ); - void interpolate3_PRECISION( vector_PRECISION phi, vector_PRECISION phi_c, level_struct *l, Thread *threading ); - void restrict_PRECISION( vector_PRECISION phi_c, vector_PRECISION phi, level_struct *l, Thread *threading ); -#endif - -#endif \ No newline at end of file diff --git a/src/sse_linalg.c b/src/sse_linalg.c deleted file mode 100644 index bf0f9d6..0000000 --- a/src/sse_linalg.c +++ /dev/null @@ -1,795 +0,0 @@ -/* - * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder. - * - * This file is part of the DDalphaAMG solver library. - * - * The DDalphaAMG solver library is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. 
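The deleted vector_double_scale kernel below broadcasts creal(alpha) and cimag(alpha) into separate registers once and then sweeps the interleaved complex data. A minimal stand-alone sketch of that idea; it uses an SSE3 addsub variant rather than the library's exact shuffle sequence, and the function name is illustrative only.

#include <pmmintrin.h>   /* SSE3: _mm_addsub_pd (also pulls in SSE2) */
#include <complex.h>

/* z[i] = alpha * x[i] for interleaved complex doubles (re,im pairs).
 * alpha is broadcast once into separate real/imag registers; n is the
 * number of complex entries. */
static void complex_scale_sse(double complex *z, const double complex *x,
                              double complex alpha, int n) {
  __m128d a_re = _mm_set1_pd(creal(alpha));
  __m128d a_im = _mm_set1_pd(cimag(alpha));
  for (int i = 0; i < n; i++) {
    __m128d v   = _mm_loadu_pd((const double *)(x + i));   /* (re, im)           */
    __m128d vsw = _mm_shuffle_pd(v, v, 1);                  /* (im, re)           */
    __m128d t   = _mm_mul_pd(a_re, v);                      /* (a_re*re, a_re*im) */
    __m128d u   = _mm_mul_pd(a_im, vsw);                    /* (a_im*im, a_im*re) */
    /* addsub: lane0 = t0-u0, lane1 = t1+u1 -> (re', im') of alpha*x[i] */
    _mm_storeu_pd((double *)(z + i), _mm_addsub_pd(t, u));
  }
}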
- * - * The DDalphaAMG solver library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * - * You should have received a copy of the GNU General Public License - * along with the DDalphaAMG solver library. If not, see http://www.gnu.org/licenses/. - * - */ - -#include "main.h" - -#ifdef SSE - -#ifdef OPTIMIZED_LINALG_double -void vector_double_scale( vector_double z, vector_double x, complex_double alpha, int start, int end, level_struct *l ) { - - int thread = omp_get_thread_num(); - if(thread == 0 && start != end) - PROF_double_START( _LA6 ); - - __m128d alpha_re = _mm_set1_pd( creal_double(alpha) ); - __m128d alpha_im = _mm_set1_pd( cimag_double(alpha) ); - double *zd = (double*)(z+start); - double *xd = (double*)(x+start); - - for( int i=start; iinner_vector_size ); -} -#endif - -#ifdef OPTIMIZED_LINALG_float -void vector_float_scale( vector_float z, vector_float x, complex_float alpha, int start, int end, level_struct *l ) { - - int thread = omp_get_thread_num(); - if(thread == 0 && start != end) - PROF_float_START( _LA6 ); - - __m128 alpha_re = _mm_set1_ps( creal_float(alpha) ); - __m128 alpha_im = _mm_set1_ps( cimag_float(alpha) ); - float *zf = (float*)(z+start); - float *xf = (float*)(x+start); - - if ( l->depth == 0 ) { - for( int i=start; iinner_vector_size ); -} -#endif - -#ifdef OPTIMIZED_LINALG_float -void vector_float_saxpy( vector_float z, vector_float x, vector_float y, complex_float alpha, int start, int end, level_struct *l ) { - - int thread = omp_get_thread_num(); - if (thread == 0 && start != end ) - PROF_float_START( _LA8 ); - - __m128 alpha_re = _mm_set1_ps( creal_float(alpha) ); - __m128 alpha_im = _mm_set1_ps( cimag_float(alpha) ); - - if ( l->depth == 0 ) { - for ( int i=start; iinner_vector_size ); -} -#endif - -#ifdef OPTIMIZED_LINALG_double -void vector_double_saxpy( vector_double z, vector_double x, vector_double y, complex_double alpha, int start, int end, level_struct *l ) { - - int thread = omp_get_thread_num(); - if (thread == 0 && start != end ) - PROF_double_START( _LA8 ); - - __m128d alpha_re = _mm_set1_pd( creal_double(alpha) ); - __m128d alpha_im = _mm_set1_pd( cimag_double(alpha) ); - - for ( int i=start; iinner_vector_size ); -} -#endif - -#ifdef OPTIMIZED_LINALG_double -complex_double global_inner_product_double( vector_double phi, vector_double psi, int start, int end, level_struct *l, struct Thread *threading ) { - - PROF_double_START( _GIP, threading ); - complex_double local_alpha = 0, global_alpha = 0; - - int thread_start; - int thread_end; - compute_core_start_end(start, end, &thread_start, &thread_end, l, threading); - - SYNC_CORES(threading) - - __m128d alpha_re = _mm_setzero_pd(); - __m128d alpha_im = _mm_setzero_pd(); - - if ( l->depth == 0 ) { - for( int i=thread_start; iworkspace)[threading->core] = local_alpha; - END_NO_HYPERTHREADS(threading) - // master sums up all results - SYNC_CORES(threading) - START_MASTER(threading) - for(int i=1; in_core; i++) - ((complex_double *)threading->workspace)[0] += ((complex_double *)threading->workspace)[i]; - local_alpha = ((complex_double *)threading->workspace)[0]; - END_MASTER(threading) - - if ( g.num_processes > 1 ) { - START_MASTER(threading) - PROF_double_START( _ALLR ); - MPI_Allreduce( &local_alpha, &global_alpha, 1, MPI_COMPLEX_double, MPI_SUM, (l->depth==0)?g.comm_cart:l->gs_double.level_comm ); - 
PROF_double_STOP( _ALLR, 1 ); - ((complex_double *)threading->workspace)[0] = global_alpha; - END_MASTER(threading) - // all threads need the result of the norm - SYNC_MASTER_TO_ALL(threading) - global_alpha = ((complex_double *)threading->workspace)[0]; - PROF_double_STOP( _GIP, (double)(end-start)/(double)l->inner_vector_size, threading ); - return global_alpha; - } else { - // all threads need the result of the norm - SYNC_MASTER_TO_ALL(threading) - local_alpha = ((complex_double *)threading->workspace)[0]; - PROF_double_STOP( _GIP, (double)(end-start)/(double)l->inner_vector_size, threading ); - return local_alpha; - } -} -#endif - -#ifdef OPTIMIZED_LINALG_float -complex_float global_inner_product_float( vector_float phi, vector_float psi, int start, int end, level_struct *l, struct Thread *threading ) { - - PROF_float_START( _GIP, threading ); - complex_float local_alpha = 0, global_alpha = 0; - - int thread_start; - int thread_end; - compute_core_start_end(start, end, &thread_start, &thread_end, l, threading); - - SYNC_CORES(threading) - - __m128 alpha_re = _mm_setzero_ps(); - __m128 alpha_im = _mm_setzero_ps(); - - float *phif = (float*)(phi+thread_start); - float *psif = (float*)(psi+thread_start); - - if ( l->depth == 0 ) { - for( int i=thread_start; iworkspace)[threading->core] = local_alpha; - END_NO_HYPERTHREADS(threading) - // master sums up all results - SYNC_CORES(threading) - START_MASTER(threading) - for(int i=1; in_core; i++) - ((complex_float *)threading->workspace)[0] += ((complex_float *)threading->workspace)[i]; - local_alpha = ((complex_float *)threading->workspace)[0]; - END_MASTER(threading) - - if ( g.num_processes > 1 ) { - START_MASTER(threading) - PROF_float_START( _ALLR ); - MPI_Allreduce( &local_alpha, &global_alpha, 1, MPI_COMPLEX_float, MPI_SUM, (l->depth==0)?g.comm_cart:l->gs_float.level_comm ); - PROF_float_STOP( _ALLR, 1 ); - ((complex_float *)threading->workspace)[0] = global_alpha; - END_MASTER(threading) - // all threads need the result of the norm - SYNC_MASTER_TO_ALL(threading) - global_alpha = ((complex_float *)threading->workspace)[0]; - PROF_float_STOP( _GIP, (double)(end-start)/(double)l->inner_vector_size, threading ); - return global_alpha; - } else { - // all threads need the result of the norm - SYNC_MASTER_TO_ALL(threading) - local_alpha = ((complex_float *)threading->workspace)[0]; - PROF_float_STOP( _GIP, (double)(end-start)/(double)l->inner_vector_size, threading ); - return local_alpha; - } -} -#endif - -#ifdef OPTIMIZED_LINALG_double -double global_norm_double( vector_double x, int start, int end, level_struct *l, struct Thread *threading ) { - - PROF_double_START( _GIP, threading ); - - double local_alpha = 0, global_alpha = 0; - - int thread_start; - int thread_end; - compute_core_start_end(start, end, &thread_start, &thread_end, l, threading); - - SYNC_CORES(threading) - - VECTOR_FOR( int i=thread_start, iworkspace)[threading->core] = local_alpha; - END_NO_HYPERTHREADS(threading) - // master sums up all results - SYNC_CORES(threading) - START_MASTER(threading) - for(int i=1; in_core; i++) - ((double *)threading->workspace)[0] += ((double *)threading->workspace)[i]; - local_alpha = ((double *)threading->workspace)[0]; - END_MASTER(threading) - - if ( g.num_processes > 1 ) { - START_MASTER(threading) - PROF_double_START( _ALLR ); - MPI_Allreduce( &local_alpha, &global_alpha, 1, MPI_double, MPI_SUM, (l->depth==0)?g.comm_cart:l->gs_double.level_comm ); - PROF_double_STOP( _ALLR, 1 ); - ((double *)threading->workspace)[0] = global_alpha; 
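The deleted global_norm_* and global_inner_product_* routines all follow the same two-stage pattern: per-core partial sums are written into threading->workspace, the master core adds them up, and one MPI_Allreduce combines the ranks. A simplified sketch of that reduction, using an OpenMP reduction clause in place of the library's core barriers and shared workspace:

#include <math.h>
#include <mpi.h>

/* Two-stage norm: thread-local partial sums, then a single Allreduce.
 * MPI_Comm and the data layout are generic here, not the library's. */
static double global_norm_sketch(const double *x, long n, MPI_Comm comm) {
  double local = 0.0;
  #pragma omp parallel for reduction(+:local)
  for (long i = 0; i < n; i++)
    local += x[i] * x[i];

  double global = 0.0;
  MPI_Allreduce(&local, &global, 1, MPI_DOUBLE, MPI_SUM, comm);
  return sqrt(global);
}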
- END_MASTER(threading) - // all threads need the result of the norm - SYNC_MASTER_TO_ALL(threading) - global_alpha = ((double *)threading->workspace)[0]; - PROF_double_STOP( _GIP, (double)(end-start)/(double)l->inner_vector_size, threading ); - return (double)sqrt((double)global_alpha); - } else { - // all threads need the result of the norm - SYNC_MASTER_TO_ALL(threading) - local_alpha = ((double *)threading->workspace)[0]; - PROF_double_STOP( _GIP, (double)(end-start)/(double)l->inner_vector_size, threading ); - return (double)sqrt((double)local_alpha); - } -} -#endif - -#ifdef OPTIMIZED_LINALG_float -float global_norm_float( vector_float x, int start, int end, level_struct *l, struct Thread *threading ) { - - PROF_float_START( _GIP, threading ); - - float local_alpha = 0, global_alpha = 0; - - int thread_start; - int thread_end; - compute_core_start_end(start, end, &thread_start, &thread_end, l, threading); - - SYNC_CORES(threading) - - __m128 alpha = _mm_setzero_ps(); - - if ( l->depth == 0 ) { - for( int i=thread_start; iworkspace)[threading->core] = local_alpha; - END_NO_HYPERTHREADS(threading) - // master sums up all results - SYNC_CORES(threading) - START_MASTER(threading) - for(int i=1; in_core; i++) - ((float *)threading->workspace)[0] += ((float *)threading->workspace)[i]; - local_alpha = ((float *)threading->workspace)[0]; - END_MASTER(threading) - - if ( g.num_processes > 1 ) { - START_MASTER(threading) - PROF_float_START( _ALLR ); - MPI_Allreduce( &local_alpha, &global_alpha, 1, MPI_float, MPI_SUM, (l->depth==0)?g.comm_cart:l->gs_float.level_comm ); - PROF_float_STOP( _ALLR, 1 ); - ((float *)threading->workspace)[0] = global_alpha; - END_MASTER(threading) - // all threads need the result of the norm - SYNC_MASTER_TO_ALL(threading) - global_alpha = ((float *)threading->workspace)[0]; - PROF_float_STOP( _GIP, (double)(end-start)/(double)l->inner_vector_size, threading ); - return (float)sqrt((double)global_alpha); - } else { - // all threads need the result of the norm - SYNC_MASTER_TO_ALL(threading) - local_alpha = ((float *)threading->workspace)[0]; - PROF_float_STOP( _GIP, (double)(end-start)/(double)l->inner_vector_size, threading ); - return (float)sqrt((double)local_alpha); - } -} -#endif - -#ifdef OPTIMIZED_LINALG_double -void vector_double_multi_saxpy( vector_double z, vector_double *V, complex_double *alpha, - int sign, int count, int start, int end, level_struct *l ) { - - int thread = omp_get_thread_num(); - if (thread == 0 && start != end ) - PROF_double_START( _LA8 ); - - int flag = 0; - __m128d alpha_re[count]; __m128d alpha_im[count]; - for ( int c=0; c EPS_double || -cimag_double(alpha[c]) > EPS_double ) - flag = 1; - } - - if ( flag == 0 ) { - for ( int c=0; c EPS_float || -cimag_float(alpha[c]) > EPS_float ) - flag = 1; - } - - if ( l->depth == 0 ) { - if ( flag == 0 ) { - for ( int c=0; cworkspace)[threading->core] = results; - END_NO_HYPERTHREADS(threading) - // master sums up all results - SYNC_CORES(threading) - START_MASTER(threading) - for(int c=0; cn_core; i++) - ((complex_double **)threading->workspace)[0][c] += ((complex_double **)threading->workspace)[i][c]; - END_MASTER(threading) - // all threads need the result of the norm - SYNC_MASTER_TO_ALL(threading) - for(int c=0; cworkspace)[0][c]; - - PROF_float_STOP( _PIP, (double)(end-start)/(double)l->inner_vector_size, threading ); -} -#endif - -#ifdef OPTIMIZED_LINALG_float -void process_multi_inner_product_float( int count, complex_float *results, vector_float *phi, vector_float psi, - int start, int 
end, level_struct *l, struct Thread *threading ) { - - PROF_float_START( _PIP, threading ); - int i; - for(int c=0; cdepth == 0 ) { - compute_core_start_end_custom(start, end, &thread_start, &thread_end, l, threading, 12); - for(int c=0; cworkspace)[threading->core] = results; - END_NO_HYPERTHREADS(threading) - // master sums up all results - SYNC_CORES(threading) - START_MASTER(threading) - for(int c=0; cn_core; i++) - ((complex_float **)threading->workspace)[0][c] += ((complex_float **)threading->workspace)[i][c]; - END_MASTER(threading) - // all threads need the result of the norm - SYNC_MASTER_TO_ALL(threading) - for(int c=0; cworkspace)[0][c]; - - PROF_float_STOP( _PIP, (double)(count*(end-start))/(double)l->inner_vector_size, threading ); -} -#endif - -#ifdef OPTIMIZED_LINALG_double -void process_multi_inner_product_double( int count, complex_double *results, vector_double *phi, vector_double psi, - int start, int end, level_struct *l, struct Thread *threading ) { - - PROF_double_START( _PIP, threading ); - int i; - for(int c=0; cdepth == 0 ) { - compute_core_start_end_custom(start, end, &thread_start, &thread_end, l, threading, 12); - for(int c=0; cworkspace)[threading->core] = results; - END_NO_HYPERTHREADS(threading) - // master sums up all results - SYNC_CORES(threading) - START_MASTER(threading) - for(int c=0; cn_core; i++) - ((complex_double **)threading->workspace)[0][c] += ((complex_double **)threading->workspace)[i][c]; - END_MASTER(threading) - // all threads need the result of the norm - SYNC_MASTER_TO_ALL(threading) - for(int c=0; cworkspace)[0][c]; - - PROF_double_STOP( _PIP, (double)(count*(end-start))/(double)l->inner_vector_size, threading ); -} -#endif - -#endif // SSE - diff --git a/src/sse_linalg.h b/src/sse_linalg.h deleted file mode 100644 index cd88fad..0000000 --- a/src/sse_linalg.h +++ /dev/null @@ -1,497 +0,0 @@ -/* - * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder. - * - * This file is part of the DDalphaAMG solver library. - * - * The DDalphaAMG solver library is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * The DDalphaAMG solver library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * - * You should have received a copy of the GNU General Public License - * along with the DDalphaAMG solver library. If not, see http://www.gnu.org/licenses/. 
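The sse_aggregate_gram_schmidt_* kernels declared in the header removed below vectorize classical Gram-Schmidt applied aggregate by aggregate to the test vectors. A plain-C reference of that operation; the contiguous column layout used here is illustrative and not the library's packed SIMD layout.

#include <complex.h>
#include <math.h>

/* Orthonormalize num_vec test vectors restricted to one aggregate of
 * length agg_size; column j starts at V + j*agg_size. */
static void aggregate_gram_schmidt_ref(float complex *V, int num_vec, int agg_size) {
  for (int j = 0; j < num_vec; j++) {
    float complex *vj = V + (long)j * agg_size;
    /* remove components along the previously orthonormalized vectors */
    for (int k = 0; k < j; k++) {
      const float complex *vk = V + (long)k * agg_size;
      float complex dot = 0.0f;
      for (int i = 0; i < agg_size; i++) dot += conjf(vk[i]) * vj[i];
      for (int i = 0; i < agg_size; i++) vj[i] -= dot * vk[i];
    }
    /* normalize */
    float norm = 0.0f;
    for (int i = 0; i < agg_size; i++) norm += crealf(conjf(vj[i]) * vj[i]);
    norm = sqrtf(norm);
    for (int i = 0; i < agg_size; i++) vj[i] /= norm;
  }
}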
- * - */ - -#ifndef LINALG_SSE_H -#define LINALG_SSE_H -#ifdef SSE - - -// Standard Gram-Schmidt on aggregates -static inline void sse_aggregate_gram_schmidt_float( complex_float *V, const int num_vec, - level_struct *l, struct Thread *threading ); -// Gram-Schmidt on a block of vectors, used by Block-Gram-Schmidt -static inline void sse_aggregate_gram_schmidt_block_float( float *V, - int num_vec, int leading_dimension, level_struct *l, struct Thread *threading ); -// used by Block-Gram-Schmidt -static inline void sse_aggregate_block_dot_block_float( float *S, float *U, float *B, - int num_vec, int leading_dimension, level_struct *l, struct Thread *threading ); -// used by Block-Gram-Schmidt -static inline void sse_aggregate_block_minus_block_times_dot_float( float *B, float *U, float *S, - int num_vec, int leading_dimension, level_struct *l, struct Thread *threading ); - -static inline void sse_aggregate_gram_schmidt_double( complex_double *V, const int num_vec, - level_struct *l, struct Thread *threading ) {} -static inline void sse_aggregate_gram_schmidt_block_double( double *V, - int num_vec, int leading_dimension, level_struct *l, struct Thread *threading ) {} -static inline void sse_aggregate_block_dot_block_double( double *S, double *U, double *B, - int num_vec, int leading_dimension, level_struct *l, struct Thread *threading ) {} -static inline void sse_aggregate_block_minus_block_times_dot_double( double *B, double *U, double *S, - int num_vec, int leading_dimension, level_struct *l, struct Thread *threading ) {} - - -static inline void sse_aggregate_gram_schmidt_float( complex_float *V, const int num_vec, level_struct *l, struct Thread *threading ) { - - PROF_float_START( _GRAM_SCHMIDT_ON_AGGREGATES, threading ); - SYNC_CORES(threading) - SYNC_HYPERTHREADS(threading) - long int i, j, k, k1, k2, k3, num_aggregates = l->s_float.num_aggregates, - aggregate_size = l->inner_vector_size / num_aggregates, offset = l->num_lattice_site_var/2; - - float *v_pt1; - float *v_pt2; - float norm1, norm2; - float next_norm1; - float next_norm2; - int ldv = SIMD_LENGTH_float; - int V_block_offset = 2*l->vector_size; - - for ( j=threading->n_thread*threading->core+threading->thread; jn_thread*threading->n_core ) { - - v_pt1 = (float *)V + 0 + j*aggregate_size*2*ldv; - - next_norm1 = 0.0; - next_norm2 = 0.0; - for ( i=0; is_float.num_aggregates, - aggregate_size = l->inner_vector_size / num_aggregates, offset = l->num_lattice_site_var/2; - - float *v_pt1; - float *v_pt2; - float norm; - float next_norm; - int ldv = leading_dimension; - //offset = 6; - - - // current thread chooses an aggregate - for ( int jp=threading->core; jp<2*num_aggregates; jp+=threading->n_core ) { - j = jp/2; - int component = jp%2; - - - v_pt1 = V + 2*component*offset*ldv + j*aggregate_size*2*ldv; - - next_norm = 0.0; - - // for the whole aggregate - for ( i=0; is_float.num_aggregates; - int aggregate_size = l->inner_vector_size / num_aggregates; - int offset = l->num_lattice_site_var/2; - - for ( int jp=threading->core; jpn_core ) { - int j = jp/2; - int component = jp%2; - // factors 2 are for complex and spin01/23 aggregates - Up = U + 2*component*offset*leading_dimension + 2*2*j*aggregate_size*leading_dimension; - Bp = B + 2*component*offset*leading_dimension + 2*2*j*aggregate_size*leading_dimension; - __m128 U_re; - __m128 U_im; - __m128 B_re; - __m128 B_im; - __m128 S_re[SIMD_LENGTH_float]; - __m128 S_im[SIMD_LENGTH_float]; - for( int i=0; is_float.num_aggregates; - int aggregate_size = l->inner_vector_size / 
num_aggregates; - int offset = l->num_lattice_site_var/2; - - for ( int jp=threading->core; jpn_core ) { - int j = jp/2; - int component = jp%2; - // factors 2 are for complex and spin01/23 aggregates - Up = U + 2*component*offset*leading_dimension + 2*2*j*aggregate_size*leading_dimension; - Bp = B + 2*component*offset*leading_dimension + 2*2*j*aggregate_size*leading_dimension; - __m128 U_re; - __m128 U_im; - __m128 B_re; - __m128 B_im; - __m128 S_re[SIMD_LENGTH_float]; - __m128 S_im[SIMD_LENGTH_float]; - for( int i=0; ivector_size), - (PRECISION *)(V + j*l->vector_size), vecs, l, threading ); - aggregate_gram_schmidt_block_PRECISION( (PRECISION *)(V + i*l->vector_size), vecs, SIMD_LENGTH_PRECISION, l, threading ); - } - SYNC_CORES(threading) - PROF_PRECISION_STOP( _GRAM_SCHMIDT_ON_AGGREGATES, 1, threading ); -} - - -void gram_schmidt_on_aggregates_PRECISION_vectorized( complex_PRECISION *V, const int num_vec, level_struct *l, struct Thread *threading ) { - - // the block version has some optimizations which are correct only on the fine grid - if(l->depth == 0) - aggregate_block_gram_schmidt_PRECISION_vectorized(V, num_vec, l, threading); - else - aggregate_gram_schmidt_PRECISION_vectorized(V, num_vec, l, threading); -} - - -void aggregate_orthogonalize_block_wrt_orthonormal_block_PRECISION( PRECISION *B, PRECISION *U, int num_vec, level_struct *l, struct Thread *threading ) { - START_NO_HYPERTHREADS(threading) - - PRECISION *S = NULL; - START_LOCKED_MASTER(threading) - // factors 2 are for complex and spin01/23 aggregates - MALLOC_HUGEPAGES(S, PRECISION, 2*2*l->s_PRECISION.num_aggregates*SIMD_LENGTH_PRECISION*SIMD_LENGTH_PRECISION, 64); - ((PRECISION **)threading->workspace)[0] = S; - END_LOCKED_MASTER(threading) - S = ((PRECISION **)threading->workspace)[0]; - - aggregate_block_dot_block_PRECISION(S, U, B, num_vec, SIMD_LENGTH_PRECISION, l , threading); - aggregate_block_minus_block_times_dot_PRECISION(B, U, S, num_vec, SIMD_LENGTH_PRECISION, l , threading); - - START_LOCKED_MASTER(threading) - FREE_HUGEPAGES(S, PRECISION, 2*2*l->s_PRECISION.num_aggregates*SIMD_LENGTH_PRECISION*SIMD_LENGTH_PRECISION); - END_LOCKED_MASTER(threading) - - END_NO_HYPERTHREADS(threading) -} - - -void aggregate_block_dot_block_PRECISION( PRECISION *S, PRECISION *U, PRECISION *B, - int num_vec, int leading_dimension, level_struct *l, struct Thread *threading ) { - sse_aggregate_block_dot_block_PRECISION( S, U, B, num_vec, leading_dimension, l, threading ); -} - - -void aggregate_block_minus_block_times_dot_PRECISION( PRECISION *B, PRECISION *U, PRECISION *S, - int num_vec, int leading_dimension, level_struct *l, struct Thread *threading ) { - sse_aggregate_block_minus_block_times_dot_PRECISION( B, U, S, num_vec, leading_dimension, l, threading ); -} - -#ifdef GRAM_SCHMIDT_VECTORIZED_PRECISION -void setup_gram_schmidt_PRECISION_compute_dots( - complex_PRECISION *thread_buffer, vector_PRECISION *V, int count, int offset, - int start, int end, level_struct *l, struct Thread *threading) { - - int thread_start; - int thread_end; - int cache_block_size = 12*16; - - for(int i=0; i<2*offset; i++) - thread_buffer[i] = 0.0; - - SYNC_CORES(threading) - compute_core_start_end_custom(start, end, &thread_start, &thread_end, l, threading, cache_block_size); - - __m128 dot_re[count]; - __m128 dot_im[count]; - __m128 dot_gamma5_re[count]; - __m128 dot_gamma5_im[count]; - - for ( int j=0; j can use 3 pre-defined +/-1 patterns - __m128 gamma5[3]; - gamma5[0] = _mm_set_ps( -1.0,-1.0,-1.0,-1.0 ); - gamma5[1] = _mm_set_ps( 1.0, 
1.0,-1.0,-1.0 ); - gamma5[2] = _mm_set_ps( 1.0, 1.0, 1.0, 1.0 ); - - for(int m=0; m<3; m++) { - - sse_complex_deinterleaved_load( (float*)(V[j]+i+k+4*m), &vj_re, &vj_im ); - sse_complex_deinterleaved_load( (float*)(V[count]+i+k+4*m), &v_re, &v_im ); - - gamma5_v_re = _mm_mul_ps(gamma5[m], v_re); - gamma5_v_im = _mm_mul_ps(gamma5[m], v_im); - - cfmadd_conj(vj_re, vj_im, v_re, v_im, dot_re+j, dot_im+j); - cfmadd_conj(vj_re, vj_im, gamma5_v_re, gamma5_v_im, dot_gamma5_re+j, dot_gamma5_im+j); - } - } - } - } - for ( int j=0; jworkspace)[threading->core] = thread_buffer; - END_NO_HYPERTHREADS(threading) - // master sums up all results - SYNC_CORES(threading) - START_MASTER(threading) - for(int i=1; in_core; i++) { - for(int j=0; jworkspace)[0][j] += ((complex_PRECISION **)threading->workspace)[i][j]; - ((complex_PRECISION **)threading->workspace)[0][j+offset] += ((complex_PRECISION **)threading->workspace)[i][j+offset]; - } - } - END_MASTER(threading) - // only master needs the result in this case (it will be distributed later) -} -#endif - -#ifdef GRAM_SCHMIDT_VECTORIZED_PRECISION -void setup_gram_schmidt_PRECISION_axpys( - complex_PRECISION *thread_buffer, vector_PRECISION *V, int count, int offset, - int start, int end, level_struct *l, struct Thread *threading) { - - int thread_start; - int thread_end; - int cache_block_size = 12*16; - - compute_core_start_end_custom(start, end, &thread_start, &thread_end, l, threading, cache_block_size); - - __m128 dot_re[count]; - __m128 dot_im[count]; - __m128 dot_gamma5_re[count]; - __m128 dot_gamma5_im[count]; - - for ( int j=0; j can use 3 pre-defined +/-1 patterns - __m128 gamma5[3]; - gamma5[0] = _mm_set_ps( -1.0,-1.0,-1.0,-1.0 ); - gamma5[1] = _mm_set_ps( 1.0, 1.0,-1.0,-1.0 ); - gamma5[2] = _mm_set_ps( 1.0, 1.0, 1.0, 1.0 ); - - for(int m=0; m<3; m++) { - - sse_complex_deinterleaved_load( (float*)(V[j]+i+k+4*m), &vj_re, &vj_im ); - sse_complex_deinterleaved_load( (float*)(V[count]+i+k+4*m), &v_re, &v_im ); - - gamma5_vj_re = _mm_mul_ps(gamma5[m], vj_re); - gamma5_vj_im = _mm_mul_ps(gamma5[m], vj_im); - - cfnmadd(vj_re, vj_im, dot_re[j], dot_im[j], &v_re, &v_im); - cfnmadd(gamma5_vj_re, gamma5_vj_im, dot_gamma5_re[j], dot_gamma5_im[j], &v_re, &v_im); - - sse_complex_interleaved_store(v_re, v_im, (float*)(V[count]+i+k+4*m) ); - } - } - } - } -} -#endif - -#endif diff --git a/src/sse_linalg_generic.h b/src/sse_linalg_generic.h deleted file mode 100644 index 00390d5..0000000 --- a/src/sse_linalg_generic.h +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder. - * - * This file is part of the DDalphaAMG solver library. - * - * The DDalphaAMG solver library is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * The DDalphaAMG solver library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * - * You should have received a copy of the GNU General Public License - * along with the DDalphaAMG solver library. If not, see http://www.gnu.org/licenses/. 
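The setup_gram_schmidt_*_compute_dots kernel above evaluates both <v_j, w> and <v_j, gamma5 w> in a single pass by multiplying with precomputed +/-1 patterns instead of reading w twice. A scalar sketch of the same single-sweep idea; the 6/6 sign split per 12-component site is a simplification of the per-lane pattern used in the SSE code.

#include <complex.h>

/* Accumulate <vj, w> and <vj, gamma5 w> in one sweep over num_sites sites
 * of 12 complex components each; gamma5 is modeled as -1 on the first six
 * components of a site and +1 on the second six. */
static void dots_with_gamma5(const float complex *vj, const float complex *w,
                             int num_sites,
                             float complex *dot, float complex *dot_g5) {
  *dot = 0.0f; *dot_g5 = 0.0f;
  for (int s = 0; s < num_sites; s++) {
    for (int c = 0; c < 12; c++) {
      float sign = (c < 6) ? -1.0f : 1.0f;                 /* gamma5 sign  */
      float complex prod = conjf(vj[12*s + c]) * w[12*s + c];
      *dot    += prod;
      *dot_g5 += sign * prod;
    }
  }
}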
- * - */ - -#ifndef SSE_LINALG_PRECISION_HEADER - #define SSE_LINALG_PRECISION_HEADER - #ifdef SSE - - void gram_schmidt_on_aggregates_PRECISION_vectorized( complex_PRECISION *V, const int num_vec, level_struct *l, struct Thread *threading ); - // Block-Gram-Schmidt on aggregates - void aggregate_block_gram_schmidt_PRECISION_vectorized( complex_PRECISION *V, const int num_vec, level_struct *l, struct Thread *threading ); - // Standard Gram-Schmidt on aggregates - void aggregate_gram_schmidt_PRECISION_vectorized( complex_PRECISION *V, const int num_vec, level_struct *l, struct Thread *threading ); - - // Gram-Schmidt on a block of vectors, used by Block-Gram-Schmidt - void aggregate_gram_schmidt_block_PRECISION( PRECISION *V, - int num_vec, int leading_dimension, level_struct *l, struct Thread *threading ); - // used by Block-Gram-Schmidt - void aggregate_orthogonalize_block_wrt_orthonormal_block_PRECISION( PRECISION *B, PRECISION *U, - int num_vec, level_struct *l, struct Thread *threading ); - // used by Block-Gram-Schmidt - void aggregate_block_dot_block_PRECISION( PRECISION *S, PRECISION *U, PRECISION *B, - int num_vec, int leading_dimension, level_struct *l, struct Thread *threading ); - // used by Block-Gram-Schmidt - void aggregate_block_minus_block_times_dot_PRECISION( PRECISION *B, PRECISION *U, PRECISION *S, - int num_vec, int leading_dimension, level_struct *l, struct Thread *threading ); - - void setup_gram_schmidt_PRECISION_compute_dots( - complex_PRECISION *thread_buffer, vector_PRECISION *V, int count, int offset, - int start, int end, level_struct *l, struct Thread *threading); - - void setup_gram_schmidt_PRECISION_axpys( - complex_PRECISION *thread_buffer, vector_PRECISION *V, int count, int offset, - int start, int end, level_struct *l, struct Thread *threading); - -#endif -#endif \ No newline at end of file diff --git a/src/sse_oddeven_generic.c b/src/sse_oddeven_generic.c deleted file mode 100644 index 724d2ee..0000000 --- a/src/sse_oddeven_generic.c +++ /dev/null @@ -1,304 +0,0 @@ -/* - * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder. - * - * This file is part of the DDalphaAMG solver library. - * - * The DDalphaAMG solver library is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * The DDalphaAMG solver library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * - * You should have received a copy of the GNU General Public License - * along with the DDalphaAMG solver library. If not, see http://www.gnu.org/licenses/. 
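The odd-even routines removed below act on one site parity at a time (_EVEN_SITES / _ODD_SITES) and assume that even sites are stored before odd sites. A tiny self-contained example of the parity convention this relies on, with an illustrative 4^4 lattice:

#include <stdio.h>

/* A site is "even" when the sum of its coordinates is even (red-black split). */
static int site_parity(int t, int z, int y, int x) {
  return (t + z + y + x) & 1;   /* 0 = even site, 1 = odd site */
}

int main(void) {
  int dims[4] = {4, 4, 4, 4}, n_even = 0, n_odd = 0;
  for (int t = 0; t < dims[0]; t++)
    for (int z = 0; z < dims[1]; z++)
      for (int y = 0; y < dims[2]; y++)
        for (int x = 0; x < dims[3]; x++)
          site_parity(t, z, y, x) ? n_odd++ : n_even++;
  printf("even sites: %d, odd sites: %d\n", n_even, n_odd);   /* 128 / 128 */
  return 0;
}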
- * - */ - -#include "main.h" - -#ifdef SSE - -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION -void hopping_term_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operator_PRECISION_struct *op, - const int amount, level_struct *l, struct Thread *threading ) { - - int start_even, end_even, start_odd, end_odd, n = l->num_inner_lattice_sites, - *neighbor = op->neighbor_table, start=0, plus_dir_param=_FULL_SYSTEM, minus_dir_param=_FULL_SYSTEM; - - SYNC_CORES(threading) - - if ( amount == _EVEN_SITES || amount == _ODD_SITES ) { - compute_core_start_end_custom(0, op->num_even_sites, &start_even, &end_even, l, threading, 1 ); - compute_core_start_end_custom(op->num_even_sites, op->num_even_sites+op->num_odd_sites, &start_odd, &end_odd, l, threading, 1 ); - } else { - compute_core_start_end_custom(0, l->num_inner_lattice_sites, &start, &n, l, threading, 1 ); - } - - if ( amount == _EVEN_SITES ) { - start = start_odd, n = end_odd; - minus_dir_param = _ODD_SITES; - plus_dir_param = _EVEN_SITES; - } else if ( amount == _ODD_SITES ) { - start = start_even, n = end_even; - minus_dir_param = _EVEN_SITES; - plus_dir_param = _ODD_SITES; - } - - complex_PRECISION *prn[4] = { op->prnT, op->prnZ, op->prnY, op->prnX }; - complex_PRECISION *prp[4] = { op->prpT, op->prpZ, op->prpY, op->prpX }; - - // project minus dir - prp_PRECISION( prn, phi, 12*start, 12*n ); - - // start communication in negative direction - START_LOCKED_MASTER(threading) - ghost_sendrecv_PRECISION( op->prnT, T, -1, &(op->c), minus_dir_param, l ); - ghost_sendrecv_PRECISION( op->prnZ, Z, -1, &(op->c), minus_dir_param, l ); - ghost_sendrecv_PRECISION( op->prnY, Y, -1, &(op->c), minus_dir_param, l ); - ghost_sendrecv_PRECISION( op->prnX, X, -1, &(op->c), minus_dir_param, l ); - END_LOCKED_MASTER(threading) - - // project plus dir and multiply with U dagger - prn_su3_PRECISION( prp, phi, op, neighbor, 12*start, 12*n ); - - if ( amount == _EVEN_SITES ) { - start = start_even, n = end_even; - } else if ( amount == _ODD_SITES ) { - start = start_odd, n = end_odd; - } - // start communication in positive direction - START_LOCKED_MASTER(threading) - ghost_sendrecv_PRECISION( op->prpT, T, +1, &(op->c), plus_dir_param, l ); - ghost_sendrecv_PRECISION( op->prpZ, Z, +1, &(op->c), plus_dir_param, l ); - ghost_sendrecv_PRECISION( op->prpY, Y, +1, &(op->c), plus_dir_param, l ); - ghost_sendrecv_PRECISION( op->prpX, X, +1, &(op->c), plus_dir_param, l ); - // wait for communication in negative direction - ghost_wait_PRECISION( op->prnT, T, -1, &(op->c), minus_dir_param, l ); - ghost_wait_PRECISION( op->prnZ, Z, -1, &(op->c), minus_dir_param, l ); - ghost_wait_PRECISION( op->prnY, Y, -1, &(op->c), minus_dir_param, l ); - ghost_wait_PRECISION( op->prnX, X, -1, &(op->c), minus_dir_param, l ); - END_LOCKED_MASTER(threading) - - // multiply with U and lift up minus dir - su3_pbp_PRECISION( eta, prn, op, neighbor, 12*start, 12*n ); - - // wait for communication in positive direction - START_LOCKED_MASTER(threading) - ghost_wait_PRECISION( op->prpT, T, +1, &(op->c), plus_dir_param, l ); - ghost_wait_PRECISION( op->prpZ, Z, +1, &(op->c), plus_dir_param, l ); - ghost_wait_PRECISION( op->prpY, Y, +1, &(op->c), plus_dir_param, l ); - ghost_wait_PRECISION( op->prpX, X, +1, &(op->c), plus_dir_param, l ); - END_LOCKED_MASTER(threading) - - // lift up plus dir - pbn_PRECISION( eta, prp, 12*start, 12*n ); - - SYNC_CORES(threading) -} -#endif - -// ---- block odd even --------------------------------------------------- - -#ifdef OPTIMIZED_SELF_COUPLING_PRECISION -void 
schwarz_PRECISION_oddeven_setup( operator_PRECISION_struct *op, level_struct *l ) { - - PRECISION *clover_pt = op->clover_vectorized, *oe_clover_pt = op->oe_clover_vectorized; - int mu, i, d0, c0, b0, a0, d1, c1, b1, a1, t, z, y, x, agg_split[4], block_split[4], block_size[4]; - - if ( g.csw ) { - for ( mu=0; mu<4; mu++ ) { - agg_split[mu] = l->local_lattice[mu]/l->coarsening[mu]; - block_split[mu] = l->coarsening[mu]/l->block_lattice[mu]; - block_size[mu] = l->block_lattice[mu]; - } - - for ( d0=0; d0oe_clover, op->clover, 0, l->inner_vector_size, l ); -#ifdef HAVE_TM - vector_PRECISION_plus( op->oe_clover, op->oe_clover, op->tm_term, 0, l->inner_vector_size, l ); -#endif - } -} -#endif - -#ifdef OPTIMIZED_SELF_COUPLING_PRECISION -void block_diag_ee_PRECISION( vector_PRECISION eta, vector_PRECISION phi, - int start, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ) { - - START_UNTHREADED_FUNCTION(threading) - PRECISION *clover_vectorized = s->op.oe_clover_vectorized + (start/12)*144; - int i, n1 = s->num_block_even_sites; - config_PRECISION clover = (g.csw==0.0)?s->op.oe_clover+start:s->op.oe_clover+(start/12)*42; - vector_PRECISION lphi = phi+start, leta = eta+start; - // diagonal blocks applied to the even sites of a block - if ( g.csw ) { - for ( i=0; inum_block_even_sites, n2 = s->num_block_odd_sites; - config_PRECISION clover = (g.csw==0.0)?s->op.oe_clover+start:s->op.oe_clover+(start/12)*42; - vector_PRECISION lphi = phi+start, leta = eta+start; - // diagonal blocks applied to the odd sites of a block - if ( g.csw ) { - error0("block_diag_oo_PRECISION is not available when using SSE\n"); - } else { - leta += n1*12; lphi += n1*12; clover += n1*12; - for ( i=0; i<12*n2; i++ ) - leta[i] = lphi[i]*clover[i]; - } - - END_UNTHREADED_FUNCTION(threading) -} -#endif - -#ifdef OPTIMIZED_SELF_COUPLING_PRECISION -void block_diag_oo_inv_PRECISION( vector_PRECISION eta, vector_PRECISION phi, - int start, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ) { - - START_UNTHREADED_FUNCTION(threading) - PRECISION *clover_vectorized = s->op.oe_clover_vectorized + (start/12)*144; - int i, n1 = s->num_block_even_sites, n2 = s->num_block_odd_sites; - config_PRECISION clover = (g.csw==0.0)?s->op.oe_clover+start:s->op.oe_clover+(start/12)*42; - vector_PRECISION lphi = phi+start, leta = eta+start; - // inverted diagonal blocks applied to the odd sites of a block - if ( g.csw ) { - leta += n1*12; lphi += n1*12; clover_vectorized += n1*144; - for ( i=0; idir_length_even, *length_odd = s->dir_length_odd, - **index = s->oe_index, *neighbor = s->op.neighbor_table; - PRECISION *Dplus = s->op.D_vectorized + (start/12)*96; - PRECISION *Dminus = s->op.D_transformed_vectorized + (start/12)*96; - - for ( int mu=0; mu<4; mu++ ) { - int a1, a2, n1, n2; - if ( amount == _EVEN_SITES ) { - a1 = 0; n1 = length_even[mu]; - a2 = n1; n2 = a2 + length_odd[mu]; - } else if ( amount == _ODD_SITES ) { - a1 = length_even[mu]; n1 = a1 + length_odd[mu]; - a2 = 0; n2 = a1; - } else { - a1 = 0; n1 = length_even[mu]+length_odd[mu]; - a2 = 0; n2 = n1; - } - block_oddeven_plus_coupling_PRECISION( (PRECISION*)(eta+start), Dplus, (PRECISION*)(phi+start), - mu, a1, n1, index[mu], neighbor ); - block_oddeven_minus_coupling_PRECISION( (PRECISION*)(eta+start), Dminus, (PRECISION*)(phi+start), - mu, a2, n2, index[mu], neighbor ); - } - - END_UNTHREADED_FUNCTION(threading) -} -#endif - -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION -void block_n_hopping_term_PRECISION( vector_PRECISION eta, 
vector_PRECISION phi, - int start, int amount, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ) { - - START_UNTHREADED_FUNCTION(threading) - - int *length_even = s->dir_length_even, *length_odd = s->dir_length_odd, - **index = s->oe_index, *neighbor = s->op.neighbor_table; - PRECISION *Dplus = s->op.D_vectorized + (start/12)*96; - PRECISION *Dminus = s->op.D_transformed_vectorized + (start/12)*96; - - for ( int mu=0; mu<4; mu++ ) { - int a1, a2, n1, n2; - if ( amount == _EVEN_SITES ) { - a1 = 0; n1 = length_even[mu]; - a2 = n1; n2 = a2 + length_odd[mu]; - } else if ( amount == _ODD_SITES ) { - a1 = length_even[mu]; n1 = a1 + length_odd[mu]; - a2 = 0; n2 = a1; - } else { - a1 = 0; n1 = length_even[mu]+length_odd[mu]; - a2 = 0; n2 = n1; - } - block_oddeven_nplus_coupling_PRECISION( (PRECISION*)(eta+start), Dplus, (PRECISION*)(phi+start), - mu, a1, n1, index[mu], neighbor ); - block_oddeven_nminus_coupling_PRECISION( (PRECISION*)(eta+start), Dminus, (PRECISION*)(phi+start), - mu, a2, n2, index[mu], neighbor ); - } - - END_UNTHREADED_FUNCTION(threading) -} -#endif - - -#endif // SSE - diff --git a/src/sse_schwarz_generic.c b/src/sse_schwarz_generic.c deleted file mode 100644 index 9ef98d7..0000000 --- a/src/sse_schwarz_generic.c +++ /dev/null @@ -1,213 +0,0 @@ -/* - * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder. - * - * This file is part of the DDalphaAMG solver library. - * - * The DDalphaAMG solver library is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * The DDalphaAMG solver library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * - * You should have received a copy of the GNU General Public License - * along with the DDalphaAMG solver library. If not, see http://www.gnu.org/licenses/. 
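schwarz_PRECISION_setup below re-packs every SU(3) link via set_PRECISION_D_vectorized (kept further down in the deleted header) into a padded split real/imaginary layout: each column is stored as three reals, a zero pad, three imaginaries, a zero pad. A plain-C reference for how a matrix-vector product consumes that layout, assuming the column-major ordering of the first output array:

#include <complex.h>

/* out = U * in, with U stored as 24 floats: column i occupies
 * [re0,re1,re2,pad, im0,im1,im2,pad] starting at U + 8*i. */
static void su3_mvm_padded(float complex out[3], const float *U,
                           const float complex in[3]) {
  for (int r = 0; r < 3; r++) out[r] = 0.0f;
  for (int i = 0; i < 3; i++) {                    /* loop over columns     */
    const float *col_re = U + 8*i;                 /* [re0,re1,re2,pad]     */
    const float *col_im = U + 8*i + 4;             /* [im0,im1,im2,pad]     */
    float complex xi = in[i];
    for (int r = 0; r < 3; r++)
      out[r] += (col_re[r] + I*col_im[r]) * xi;    /* out += U[:,i] * in[i] */
  }
}

The pad lanes carry zeros, so a 4-wide SIMD load of each column half is safe and the broadcast multiply-add over columns maps directly onto SSE registers.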
- * - */ - -#include "main.h" - -#ifdef SSE - -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION -void block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, int k, - schwarz_PRECISION_struct *s, level_struct *l ) { - int *bbl = s->block_boundary_length; - PRECISION *Dplus = s->op.D_vectorized; - PRECISION *Dminus = s->op.D_transformed_vectorized; - - for ( int mu=0; mu<4; mu++ ) { - boundary_plus_coupling_PRECISION( (PRECISION*)eta, Dplus, (PRECISION*)phi, - mu, bbl[2*mu], bbl[2*mu+1], s->block[k].bt, NULL ); - boundary_minus_coupling_PRECISION( (PRECISION*)eta, Dminus, (PRECISION*)phi, - mu, bbl[2*mu+1], bbl[2*mu+2], s->block[k].bt, NULL ); - } -} -#endif - - -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION -void n_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, int k, - schwarz_PRECISION_struct *s, level_struct *l ) { - int *bbl = s->block_boundary_length; - PRECISION *Dplus = s->op.D_vectorized; - PRECISION *Dminus = s->op.D_transformed_vectorized; - - for ( int mu=0; mu<4; mu++ ) { - boundary_nplus_coupling_PRECISION( (PRECISION*)eta, Dplus, (PRECISION*)phi, - mu, bbl[2*mu], bbl[2*mu+1], s->block[k].bt, NULL ); - boundary_nminus_coupling_PRECISION( (PRECISION*)eta, Dminus, (PRECISION*)phi, - mu, bbl[2*mu+1], bbl[2*mu+2], s->block[k].bt, NULL ); - } -} -#endif - -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION -void coarse_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, - int k, schwarz_PRECISION_struct *s, level_struct *l ) { - // k: number of current block - int *bbl = s->block_boundary_length, n = l->num_lattice_site_var; - int column_offset = SIMD_LENGTH_PRECISION*((l->num_lattice_site_var+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - int vectorized_link_offset = 2*l->num_lattice_site_var*column_offset; - - for ( int mu=0; mu<4; mu++ ) { - OPERATOR_TYPE_PRECISION *Dplus = s->op.D_vectorized + mu*vectorized_link_offset; - OPERATOR_TYPE_PRECISION *Dminus = s->op.D_transformed_vectorized + mu*vectorized_link_offset; - // plus mu direction - for ( int i=bbl[2*mu]; iblock[k].bt[i]; - int neighbor_index = s->block[k].bt[i+1]; - vector_PRECISION phi_pt = phi + n*neighbor_index; - vector_PRECISION eta_pt = eta + n*index; - coarse_hopp_PRECISION_vectorized( eta_pt, phi_pt, Dplus + 4*vectorized_link_offset*index, l ); - } - // minus mu direction - for ( int i=bbl[2*mu+1]; iblock[k].bt[i]; - int neighbor_index = s->block[k].bt[i+1]; - vector_PRECISION phi_pt = phi + n*neighbor_index; - vector_PRECISION eta_pt = eta + n*index; - coarse_hopp_PRECISION_vectorized( eta_pt, phi_pt, Dminus + 4*vectorized_link_offset*neighbor_index, l ); - } - } -} -#endif - -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION -void n_coarse_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, - int k, schwarz_PRECISION_struct *s, level_struct *l ) { - // k: number of current block - int *bbl = s->block_boundary_length, n = l->num_lattice_site_var; - int column_offset = SIMD_LENGTH_PRECISION*((l->num_lattice_site_var+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - int vectorized_link_offset = 2*l->num_lattice_site_var*column_offset; - - for ( int mu=0; mu<4; mu++ ) { - OPERATOR_TYPE_PRECISION *Dplus = s->op.D_vectorized + mu*vectorized_link_offset; - OPERATOR_TYPE_PRECISION *Dminus = s->op.D_transformed_vectorized + mu*vectorized_link_offset; - // plus mu direction - for ( int i=bbl[2*mu]; iblock[k].bt[i]; - int neighbor_index = s->block[k].bt[i+1]; - vector_PRECISION phi_pt = phi + n*neighbor_index; - vector_PRECISION eta_pt = eta + 
n*index; - coarse_n_hopp_PRECISION_vectorized( eta_pt, phi_pt, Dplus + 4*vectorized_link_offset*index, l ); - } - // minus mu direction - for ( int i=bbl[2*mu+1]; i<bbl[2*mu+2]; i+=2 ) { - int index = s->block[k].bt[i]; - int neighbor_index = s->block[k].bt[i+1]; - vector_PRECISION phi_pt = phi + n*neighbor_index; - vector_PRECISION eta_pt = eta + n*index; - coarse_n_hopp_PRECISION_vectorized( eta_pt, phi_pt, Dminus + 4*vectorized_link_offset*neighbor_index, l ); - } - } -} -#endif - -#if defined(OPTIMIZED_NEIGHBOR_COUPLING_PRECISION) || defined(OPTIMIZED_SELF_COUPLING_PRECISION) -void schwarz_PRECISION_setup( schwarz_PRECISION_struct *s, operator_double_struct *op_in, level_struct *l ) { - -/********************************************************************************* -* Copies the Dirac operator and the clover term from op_in into the Schwarz -* struct (this function is depth 0 only). -* - operator_double_struct *op_in: Input operator. -*********************************************************************************/ - - int i, index, n = l->num_inner_lattice_sites, *tt = s->op.translation_table; - config_PRECISION D_out_pt, clover_out_pt; - config_double D_in_pt = op_in->D, clover_in_pt = op_in->clover; -#ifdef HAVE_TM - config_PRECISION tm_term_out_pt, odd_proj_out_pt; - config_double tm_term_in_pt = op_in->tm_term, odd_proj_in_pt = op_in->odd_proj; -#endif - - for ( i=0; i<n; i++ ) { - index = tt[i]; - D_out_pt = s->op.D + 36*index; - FOR36( *D_out_pt = (complex_PRECISION) *D_in_pt; D_out_pt++; D_in_pt++; ) - } - -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - for ( i=0; i<n; i++ ) { - PRECISION *D_vectorized = s->op.D_vectorized + 96*i; - PRECISION *D_transformed_vectorized = s->op.D_transformed_vectorized + 96*i; - complex_PRECISION *D_out_pt = s->op.D + 36*i; - for ( int mu=0; mu<4; mu++ ) { - set_PRECISION_D_vectorized( D_vectorized+24*mu, D_transformed_vectorized+24*mu, D_out_pt+9*mu ); - } - } -#endif - - if ( g.csw != 0 ) { - for ( i=0; i<n; i++ ) { - index = tt[i]; - clover_out_pt = s->op.clover + 42*index; -#ifdef OPTIMIZED_SELF_COUPLING_PRECISION - PRECISION *clover_out_vectorized_pt = s->op.clover_vectorized + 144*index; - sse_set_clover_PRECISION( clover_out_vectorized_pt, clover_in_pt ); -#endif - FOR42( *clover_out_pt = (complex_PRECISION) *clover_in_pt; clover_out_pt++; clover_in_pt++; ) - } - } else { - for ( i=0; i<n; i++ ) { - index = tt[i]; - clover_out_pt = s->op.clover + 12*index; - FOR12( *clover_out_pt = (complex_PRECISION) *clover_in_pt; clover_out_pt++; clover_in_pt++; ) - } - } - -#ifdef HAVE_TM - for ( i=0; i<n; i++ ) { - index = tt[i]; - tm_term_out_pt = s->op.tm_term + 12*index; -#ifdef OPTIMIZED_SELF_COUPLING_PRECISION - if ( g.csw != 0 ) { - PRECISION *clover_out_vectorized_pt = s->op.clover_vectorized + 144*index; - sse_add_diagonal_clover_PRECISION( clover_out_vectorized_pt, tm_term_in_pt ); - } -#endif - FOR12( *tm_term_out_pt = (complex_PRECISION) *tm_term_in_pt; tm_term_out_pt++; tm_term_in_pt++; ) - } - - for ( i=0; i<n; i++ ) { - index = tt[i]; - odd_proj_out_pt = s->op.odd_proj + 12*index; - FOR12( *odd_proj_out_pt = (complex_PRECISION) *odd_proj_in_pt; odd_proj_out_pt++; odd_proj_in_pt++; ) - } -#endif - - if ( g.odd_even ) - schwarz_PRECISION_oddeven_setup( &(s->op), l ); - - schwarz_PRECISION_boundary_update( s, l ); - -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - int start = l->num_lattice_sites; - int end = 2*l->num_lattice_sites - l->num_inner_lattice_sites; - for ( i=start; i<end; i++ ) { - PRECISION *D_vectorized = s->op.D_vectorized + 96*i; - PRECISION *D_transformed_vectorized = s->op.D_transformed_vectorized + 96*i; - complex_PRECISION *D_out_pt = s->op.D + 36*i; - for ( int mu=0; mu<4; mu++ ) { - set_PRECISION_D_vectorized( D_vectorized+24*mu, D_transformed_vectorized+24*mu, D_out_pt+9*mu ); - } - } -#endif -} -#endif - -#endif // SSE diff --git a/src/sse_schwarz_generic.h 
b/src/sse_schwarz_generic.h deleted file mode 100644 index 5bd6218..0000000 --- a/src/sse_schwarz_generic.h +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder. - * - * This file is part of the DDalphaAMG solver library. - * - * The DDalphaAMG solver library is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * The DDalphaAMG solver library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * - * You should have received a copy of the GNU General Public License - * along with the DDalphaAMG solver library. If not, see http://www.gnu.org/licenses/. - * - */ - -#ifndef SSE_SCHWARZ_PRECISION_H -#define SSE_SCHWARZ_PRECISION_H -#ifdef SSE - -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION -static inline void set_PRECISION_D_vectorized( PRECISION *out1, PRECISION *out2, complex_PRECISION *in ) { - // out1: column major, out2: row major - for ( int i=0; i<3; i++ ) { // column - for ( int j=0; j<3; j++ ) { // row - out1[8*i +j] = creal_PRECISION(in[3*j+i]); - out1[8*i+4+j] = cimag_PRECISION(in[3*j+i]); - out2[8*i +j] = creal_PRECISION(in[j+3*i]); - out2[8*i+4+j] = cimag_PRECISION(in[j+3*i]); - } - out1[8*i+3] = 0.0; - out1[8*i+7] = 0.0; - out2[8*i+3] = 0.0; - out2[8*i+7] = 0.0; - } -} -#endif - -#endif // SSE -#endif diff --git a/src/threading.c b/src/threading.c index b78fa1f..e5d3f2c 100644 --- a/src/threading.c +++ b/src/threading.c @@ -31,7 +31,9 @@ void no_hyperthread_barrier(void *barrier, int id) } void core_barrier(int core) { +#ifdef OPENMP #pragma omp barrier +#endif } void hyperthread_barrier(void *barrier, int hyperthead) { diff --git a/src/threading.h b/src/threading.h index 7aafaed..f2e1742 100644 --- a/src/threading.h +++ b/src/threading.h @@ -43,11 +43,11 @@ // nested omp: split into cores, each core splits into hyperthreads (like DD preconditioner) #define CORE_BARRIER(threading) \ do { \ - threading->barrier(threading->core); \ + threading->barrier(threading->core); \ } while(0) #define HYPERTHREAD_BARRIER(threading) \ do { \ - threading->thread_barrier(threading->thread_barrier_data, threading->thread); \ + threading->thread_barrier(threading->thread_barrier_data, threading->thread); \ } while(0) #endif @@ -77,8 +77,10 @@ if(threading->thread == 0) \ CORE_BARRIER(threading); +#define MASTER(threading) \ + if(threading->core + threading->thread == 0) #define START_MASTER(threading) \ - if(threading->core + threading->thread == 0) { + MASTER(threading) { #define END_MASTER(threading) \ } @@ -101,7 +103,10 @@ #ifdef OPENMP #include <omp.h> +#define DO_PRAGMA(EXP) _Pragma (#EXP) +#define THREADED(EXP) DO_PRAGMA ( omp parallel num_threads( EXP ) ) #else +#define THREADED(EXP) static inline int omp_get_thread_num( void ) { return 0; } @@ -112,7 +117,7 @@ static inline int omp_get_num_threads( void ) { struct level_struct; -struct common_thread_data +typedef struct common_thread_data { // barrier among cores void (*barrier)(int); @@ -121,7 +126,7 @@ struct common_thread_data // *common* workspace for *all* threads // sometimes threads need to exchange data, they can use this char *workspace; -}; +} common_thread_data; void 
init_common_thread_data(struct common_thread_data *common); diff --git a/src/top_level.c b/src/top_level.c index 1a135b0..354f170 100644 --- a/src/top_level.c +++ b/src/top_level.c @@ -27,16 +27,13 @@ void rhs_define( vector_double rhs, level_struct *l, struct Thread *threading ) if(threading->thread != 0) return; - int start = threading->start_index[l->depth]; - int end = threading->end_index[l->depth]; - if ( g.rhs == 0 ) { - vector_double_define( rhs, 1, start, end, l ); + vector_double_define_real( rhs, 1, 0, l->inner_vector_size, l, threading ); START_MASTER(threading) if ( g.print > 0 ) printf0("rhs = ones\n"); END_MASTER(threading) } else if ( g.rhs == 1 ) { - vector_double_define( rhs, 0, start, end, l ); + vector_double_define_zero( rhs, 0, l->inner_vector_size, l, threading ); if ( g.my_rank == 0 ) { START_LOCKED_MASTER(threading) rhs[0] = 1.0; @@ -47,17 +44,16 @@ void rhs_define( vector_double rhs, level_struct *l, struct Thread *threading ) END_MASTER(threading) } else if ( g.rhs == 2 ) { // this would yield different results if we threaded it, so we don't - START_LOCKED_MASTER(threading) - vector_double_define_random( rhs, 0, l->inner_vector_size, l ); - END_LOCKED_MASTER(threading) + vector_double_define_random( rhs, 0, l->inner_vector_size, l, threading ); START_MASTER(threading) if ( g.print > 0 ) printf0("rhs = random\n"); END_MASTER(threading) } else if ( g.rhs == 3 ) { - vector_double_define( rhs, 0, start, end, l ); + vector_double_define_zero( rhs, 0, l->inner_vector_size, l, threading ); } else { ASSERT( g.rhs >= 0 && g.rhs <= 4 ); } + } @@ -65,8 +61,8 @@ int wilson_driver( vector_double solution, vector_double source, level_struct *l int iter = 0, start = threading->start_index[l->depth], end = threading->end_index[l->depth]; - vector_double rhs = g.mixed_precision==2?g.p_MP.dp.b:g.p.b; - vector_double sol = g.mixed_precision==2?g.p_MP.dp.x:g.p.x; + vector_double rhs = (g.mixed_precision==2 && g.method >= 0)?g.p_MP.dp.b:g.p.b; + vector_double sol = (g.mixed_precision==2 && g.method >= 0)?g.p_MP.dp.x:g.p.x; #ifdef WILSON_BENCHMARK START_MASTER(threading) @@ -106,12 +102,11 @@ int wilson_driver( vector_double solution, vector_double source, level_struct *l void solve( vector_double solution, vector_double source, level_struct *l, struct Thread *threading ) { - vector_double rhs = g.mixed_precision==2?g.p_MP.dp.b:g.p.b; - if ( g.vt.evaluation ) { + vector_double rhs = g.mixed_precision==2?g.p_MP.dp.b:g.p.b; // this would yield different results if we threaded it, so we don't + vector_double_define_random( rhs, 0, l->inner_vector_size, l, threading ); START_LOCKED_MASTER(threading) - vector_double_define_random( rhs, 0, l->inner_vector_size, l ); scan_var( &(g.vt), l ); END_LOCKED_MASTER(threading) } else { @@ -123,24 +118,34 @@ void solve_driver( level_struct *l, struct Thread *threading ) { vector_double solution = NULL, source = NULL; - double minus_twisted_bc[4]; - - START_LOCKED_MASTER(threading) + double minus_twisted_bc[4], norm; + if(g.bc==2) for ( int i=0; i<4; i++ ) - minus_twisted_bc[i] = g.twisted_bc[i]; - END_LOCKED_MASTER(threading) + minus_twisted_bc[i] = -1*g.twisted_bc[i]; +#ifdef HAVE_TM1p1 + if( g.epsbar != 0 || g.epsbar_ig5_odd_shift != 0 || g.epsbar_ig5_even_shift != 0 ) { + data_layout_n_flavours( 2, l, threading ); + printf0("inverting doublet operator\n"); + } +#endif PUBLIC_MALLOC( solution, complex_double, l->inner_vector_size ); PUBLIC_MALLOC( source, 
complex_double, l->inner_vector_size ); - + rhs_define( source, l, threading ); if(g.bc==2) apply_twisted_bc_to_vector_double( source, source, g.twisted_bc, l); + norm = global_norm_double( source, 0, l->inner_vector_size, l, threading ); + printf0("source vector norm: %le\n",norm); + +#ifdef HAVE_TM1p1 + if( g.n_flavours == 1 ) +#endif #ifdef HAVE_TM - if (g.tm_mu + g.tm_mu_odd_shift != 0.0 || g.tm_mu + g.tm_mu_even_shift != 0.0 ) + if ( g.mu + g.mu_odd_shift != 0.0 || g.mu + g.mu_even_shift != 0.0 ) if(g.downprop) { START_MASTER(threading) @@ -150,16 +155,17 @@ void solve_driver( level_struct *l, struct Thread *threading ) { solve( solution, source, l, threading ); if(g.bc==2) - apply_twisted_bc_to_vector_double( solution, solution, minus_twisted_bc, l); + apply_twisted_bc_to_vector_double( solution, solution, minus_twisted_bc, l); START_LOCKED_MASTER(threading) printf0("\n\n+-------------------------- down --------------------------+\n\n"); - g.tm_mu*=-1; - g.tm_mu_odd_shift*=-1; - g.tm_mu_even_shift*=-1; + g.mu*=-1; + g.mu_odd_shift*=-1; + g.mu_even_shift*=-1; END_LOCKED_MASTER(threading) - - optimized_shift_update( l->dirac_shift, l, threading ); + + tm_term_update( g.mu, l, threading ); + finalize_operator_update( l, threading ); } #endif @@ -167,8 +173,16 @@ void solve_driver( level_struct *l, struct Thread *threading ) { if(g.bc==2) apply_twisted_bc_to_vector_double( solution, solution, minus_twisted_bc, l); - + + norm = global_norm_double( solution, 0, l->inner_vector_size, l, threading ); + printf0("solution vector norm: %le\n",norm); + PUBLIC_FREE( solution, complex_double, l->inner_vector_size ); PUBLIC_FREE( source, complex_double, l->inner_vector_size ); + +#ifdef HAVE_TM1p1 + if( g.n_flavours == 2 ) + data_layout_n_flavours( 1, l, threading ); +#endif } diff --git a/src/var_table.h b/src/var_table.h index 53e766e..8066522 100644 --- a/src/var_table.h +++ b/src/var_table.h @@ -54,9 +54,11 @@ for ( int i=0; i<g.vt.average_over; i++ ) { \ g.vt.p_end->values[_TRCKD_VAL] = *tmp_var; \ parameter_update( l ); \ - if ( g.vt.shift_update ) \ - shift_update( *tmp_var, l, no_threading ); \ - if ( g.vt.re_setup ) { \ + if ( g.vt.shift_update ) { \ + m0_update( *tmp_var, l, no_threading ); \ + g.m0 = *tmp_var; \ + } \ + if ( g.vt.re_setup ) { \ double t0, t1; \ t0 = MPI_Wtime(); \ method_re_setup( l, no_threading ); \ @@ -67,20 +69,20 @@ printf0("scanning variable \"%s\", value: %lf, run %d of %d\n", name, (double)(*tmp_var), i+1, g.vt.average_over ); \ if ( g.vt.track_error ) { \ apply_operator_double( b, v, &(g.p), l, no_threading ); \ - vector_double_define( x, 0, 0, l->inner_vector_size, l ); \ + vector_double_define_zero( x, 0, l->inner_vector_size, l, no_threading ); \ if ( g.vt.track_cgn_error ) { \ ASSERT( g.method >=0 && g.p.restart_length >= 4 ); \ - vector_double_define( x, 0, 0, l->inner_vector_size, l ); \ + vector_double_define_zero( x, 0, l->inner_vector_size, l, no_threading ); \ cgn_double( &(g.p), l, no_threading ); \ vector_double_minus( x, x, v, 0, l->inner_vector_size, l ); \ g.vt.p_end->values[_CGNR_ERR] += ( global_norm_double( x, 0, l->inner_vector_size, l, no_threading ) / norm_v ) / ((double)g.vt.average_over); \ printf0("CGN: error norm: %le\n", g.vt.p_end->values[_CGNR_ERR] ); \ - vector_double_define( x, 0, 0, l->inner_vector_size, l ); \ + vector_double_define_zero( x, 0, l->inner_vector_size, l, no_threading ); \ } \ } else {\ rhs_define( b, l, no_threading );\ } \ - vector_double_define( x, 0, 0, l->inner_vector_size, l ); \ + vector_double_define_zero( x, 0, l->inner_vector_size, l, no_threading 
); \ if (g.mixed_precision==2) fgmres_MP( &(g.p_MP), l, no_threading ); \ else fgmres_double( &(g.p), l, no_threading ); \ if ( i == g.vt.average_over-1 ) prof_print( l ); \ diff --git a/src/vcycle_generic.c b/src/vcycle_generic.c index be46be6..e20e094 100644 --- a/src/vcycle_generic.c +++ b/src/vcycle_generic.c @@ -23,7 +23,7 @@ #include "vcycle_PRECISION.h" void smoother_PRECISION( vector_PRECISION phi, vector_PRECISION Dphi, vector_PRECISION eta, - int n, const int res, complex_PRECISION shift, level_struct *l, struct Thread *threading ) { + int n, const int res, level_struct *l, struct Thread *threading ) { ASSERT( phi != eta ); @@ -41,7 +41,6 @@ void smoother_PRECISION( vector_PRECISION phi, vector_PRECISION Dphi, vector_PRE int start = threading->start_index[l->depth]; int end = threading->end_index[l->depth]; START_LOCKED_MASTER(threading) - l->sp_PRECISION.shift = shift; l->sp_PRECISION.initial_guess_zero = res; l->sp_PRECISION.num_restart = n; END_LOCKED_MASTER(threading) @@ -128,14 +127,15 @@ void vcycle_PRECISION( vector_PRECISION phi, vector_PRECISION Dphi, vector_PRECI g.coarse_time += MPI_Wtime(); END_MASTER(threading) } - if( i == 0 && res == _NO_RES ) - interpolate3_PRECISION( phi, l->next_level->p_PRECISION.x, l, threading ); - else + if( i == 0 && res == _NO_RES ) { + vector_PRECISION_define_zero( phi, 0, l->inner_vector_size, l, threading ); interpolate_PRECISION( phi, l->next_level->p_PRECISION.x, l, threading ); - smoother_PRECISION( phi, Dphi, eta, l->post_smooth_iter, _RES, _NO_SHIFT, l, threading ); + } else + interpolate_PRECISION( phi, l->next_level->p_PRECISION.x, l, threading ); + smoother_PRECISION( phi, Dphi, eta, l->post_smooth_iter, _RES, l, threading ); res = _RES; } } else { - smoother_PRECISION( phi, Dphi, eta, (l->depth==0)?l->n_cy:l->post_smooth_iter, res, _NO_SHIFT, l, threading ); + smoother_PRECISION( phi, Dphi, eta, (l->depth==0)?l->n_cy:l->post_smooth_iter, res, l, threading ); } } diff --git a/src/vcycle_generic.h b/src/vcycle_generic.h index d8a5033..5e54a74 100644 --- a/src/vcycle_generic.h +++ b/src/vcycle_generic.h @@ -33,7 +33,7 @@ #include "solver_analysis.h" void smoother_PRECISION( vector_PRECISION phi, vector_PRECISION Dphi, vector_PRECISION eta, - int n, const int res, complex_PRECISION shift, level_struct *l, struct Thread *threading ); + int n, const int res, level_struct *l, struct Thread *threading ); void vcycle_PRECISION( vector_PRECISION phi, vector_PRECISION Dphi, vector_PRECISION eta, int res, level_struct *l, struct Thread *threading ); diff --git a/src/vectorization_control.h b/src/vectorization_control.h index 772063b..abd5f3e 100644 --- a/src/vectorization_control.h +++ b/src/vectorization_control.h @@ -24,18 +24,16 @@ #ifdef SSE -#define SIMD_LENGTH_float 4 +#define SIMD_LENGTH_float 4 #define SIMD_LENGTH_double 2 -#ifndef HAVE_TM // TODO: make it work for TM -#define INTERPOLATION_OPERATOR_LAYOUT_OPTIMIZED_float -#define INTERPOLATION_SETUP_LAYOUT_OPTIMIZED_float -#endif -#define VECTORIZE_COARSE_OPERATOR_float -#define GRAM_SCHMIDT_VECTORIZED_float +#define OPTIMIZED_COARSE_NEIGHBOR_COUPLING_float +#define OPTIMIZED_COARSE_SELF_COUPLING_float +#define OPTIMIZED_INTERPOLATION_OPERATOR_float +#define OPTIMIZED_INTERPOLATION_SETUP_float +#define OPTIMIZED_NEIGHBOR_COUPLING_double #define OPTIMIZED_NEIGHBOR_COUPLING_float #define OPTIMIZED_SELF_COUPLING_float -#define OPTIMIZED_NEIGHBOR_COUPLING_double #define OPTIMIZED_LINALG_float #define OPTIMIZED_LINALG_double diff --git a/src/vectorization_dirac_generic.c 
b/src/vectorization_dirac_generic.c index a07919a..9ea2b3e 100644 --- a/src/vectorization_dirac_generic.c +++ b/src/vectorization_dirac_generic.c @@ -40,8 +40,8 @@ void d_plus_clover_aggregate_PRECISION_vectorized( complex_PRECISION *eta1, comp config_PRECISION D = s->op.D; // add clover term/shift - spin0and1_site_clover_PRECISION_vectorized( eta1, phi+site_offset*site, s->op.clover+42*site, 4+l->dirac_shift, offset ); - spin2and3_site_clover_PRECISION_vectorized( eta2, phi+site_offset*site, s->op.clover+42*site, 4+l->dirac_shift, offset ); + spin0and1_site_clover_PRECISION_vectorized( eta1, phi+site_offset*site, s->op.clover+42*site, 4+s->op.m0, offset ); + spin2and3_site_clover_PRECISION_vectorized( eta2, phi+site_offset*site, s->op.clover+42*site, 4+s->op.m0, offset ); index_out = site; @@ -102,3 +102,15 @@ void d_neighbor_aggregate_PRECISION_vectorized( complex_PRECISION *eta1, complex twospin2_p_PRECISION_vectorized_simd_length( eta1, eta2, buffer, mu ); } #endif + +#ifdef SSE +void diagonal_aggregate_PRECISION_vectorized( complex_PRECISION *eta1, complex_PRECISION *eta2, + complex_PRECISION *phi, schwarz_PRECISION_struct *s, level_struct *l, + int site ) { + + int offset = SIMD_LENGTH_PRECISION; + int site_offset = 12*offset; + + sse_diagonal_aggregate_PRECISION( eta1, eta2, phi+site_offset*site, s->op.odd_proj+12*site, offset ); +} +#endif diff --git a/src/vectorization_dirac_generic.h b/src/vectorization_dirac_generic.h index 3831b5d..5b8f02c 100644 --- a/src/vectorization_dirac_generic.h +++ b/src/vectorization_dirac_generic.h @@ -35,6 +35,10 @@ complex_PRECISION *phi, schwarz_PRECISION_struct *s, level_struct *l, int site, int *direction_flags ); + void diagonal_aggregate_PRECISION_vectorized( complex_PRECISION *eta1, complex_PRECISION *eta2, + complex_PRECISION *phi, schwarz_PRECISION_struct *s, + level_struct *l, int site ); + // spinors are vectorized, gauge is same for all (use for multiple rhs) static inline void mvm_PRECISION_vectorized_simd_length( const complex_PRECISION *eta, const complex_PRECISION *D, const complex_PRECISION *phi ) { @@ -129,4 +133,4 @@ #endif } -#endif \ No newline at end of file +#endif
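
Note on the THREADED(EXP) macro added to src/threading.h above: it expands via _Pragma to an "omp parallel num_threads(EXP)" region when OPENMP is defined, and to nothing otherwise, so the compound statement that follows runs either in parallel or serially. The snippet below is a minimal, self-contained usage sketch and is not part of the library: the macro bodies and the serial omp_get_* fallbacks are copied from the threading.h hunk, while main() and the thread count of 4 are illustrative assumptions.

/* build with e.g. "mpiicc -qopenmp -DOPENMP demo.c" or, serially, "mpiicc demo.c" */
#include <stdio.h>

#ifdef OPENMP
#include <omp.h>
#define DO_PRAGMA(EXP) _Pragma (#EXP)
#define THREADED(EXP) DO_PRAGMA ( omp parallel num_threads( EXP ) )
#else
#define THREADED(EXP)
static inline int omp_get_thread_num( void ) { return 0; }
static inline int omp_get_num_threads( void ) { return 1; }
#endif

int main( void ) {
  THREADED(4)   /* parallel region with 4 threads; a plain serial block without OPENMP */
  {
    printf( "thread %d of %d\n", omp_get_thread_num(), omp_get_num_threads() );
  }
  return 0;
}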